Mercurial > repos > petr-novak > re_utils

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_cluster_table.R	Fri May 14 11:08:46 2021 +0000
@@ -0,0 +1,57 @@
+#!/usr/bin/env Rscript
+## Summarize a RepeatExplorer2 CLUSTER_TABLE.csv: sum Size_adjusted per
+## Final_annotation group and report each group's share of the analyzed reads.
+## Groups matching "contamination|organelle" are left out of the table and their
+## reads are subtracted from the read total used as the denominator.
+library(optparse)
+option_list <- list(
+  make_option(c("-c", "--cluster_table"), default = NA, type = "character",
+              help = "file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"),
+  ## accepted on the command line, but not used anywhere in this script
+  make_option(c("-m", "--comparative_counts"), default = NA, type = "character",
+              help = "file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"),
+  make_option(c("-o", "--output"), type = "character",
+              help = "output file name")
+)
+opt <- parse_args(OptionParser(option_list = option_list))
+if (is.null(opt$cluster_table) || is.na(opt$cluster_table) || is.null(opt$output)) {
+  stop("both --cluster_table and --output must be specified", call. = FALSE)
+}
+
+## The file starts with "key<TAB>value" summary lines; the cluster table itself
+## begins at the line holding the Cluster/Supercluster/Size column names.
+cluster_annotation <- opt$cluster_table
+table_lines <- readLines(cluster_annotation)
+header_line <- grep(".*Cluster.*Supercluster.*Size", table_lines)[1]
+if (is.na(header_line)) {
+  stop("no Cluster/Supercluster/Size header line in ", cluster_annotation, call. = FALSE)
+}
+annot <- read.table(cluster_annotation, sep = "\t", header = TRUE, as.is = TRUE, skip = header_line - 1)
+if (!all(c("Size_adjusted", "Final_annotation") %in% colnames(annot))) {
+  stop("CLUSTER_TABLE lacks Size_adjusted or Final_annotation column", call. = FALSE)
+}
+
+## total number of analyzed reads, taken from the summary lines above the table
+reads_line <- grep("Number_of_analyzed_reads", table_lines[seq_len(header_line)], value = TRUE)
+if (length(reads_line) == 0) {
+  stop("Number_of_analyzed_reads not found above the table header", call. = FALSE)
+}
+input_read_counts <- as.numeric(strsplit(reads_line, split = "\t")[[1]][2])
+
+## complete classification table:
+unique_groups <- sort(unique(annot$Final_annotation))
+groups_to_remove <- grep("contamination|organelle", unique_groups, value = TRUE)
+groups_to_keep <- unique_groups[!(unique_groups %in% groups_to_remove)]
+## sum() of an empty selection is 0, so this also covers "nothing to remove"
+removed_reads <- sum(annot$Size_adjusted[annot$Final_annotation %in% groups_to_remove])
+input_count_reads_corrected <- input_read_counts - removed_reads
+
+sum_of_reads <- vapply(groups_to_keep, function(g) {
+  sum(annot$Size_adjusted[annot$Final_annotation %in% g])
+}, numeric(1))
+summary_table <- data.frame(Annotation = groups_to_keep,
+                            Number_of_reads = sum_of_reads,
+                            "Proportion[%]" = sum_of_reads / input_count_reads_corrected * 100,
+                            check.names = FALSE)
+print(opt$output)
+write.table(summary_table, file = opt$output, row.names = FALSE, col.names = TRUE, sep = "\t")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_cluster_table.xml	Fri May 14 11:08:46 2021 +0000
@@ -0,0 +1,42 @@
+<tool id="summarize_annotation" name="Make summary of CLUSTER_TABLE" version="1.0.0">
+  <description> Simple utility to summarize final annotation from RepeatExplorer CLUSTER_TABLE</description>
+  <requirements>
+    <requirement type="package">r-optparse</requirement>
+  </requirements>
+
+    <!-- Paths are single-quoted so dataset locations containing spaces or shell metacharacters stay one argument. -->
+    <command detect_errors="exit_code"><![CDATA[
+    Rscript '$__tool_directory__/summarize_cluster_table.R'
+    --cluster_table='$cluster_table'
+    --output='$output'
+    ]]></command>
+
+    <inputs>
+      <param format="txt" type="data" name="cluster_table" label="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv" help="CLUSTER_TABLE.csv must contain a completed Final_annotation column" />
+    </inputs>
+
+    <outputs>
+      <data format="tabular" name="output" label="Summary of repeat proportion from ${cluster_table.hid}"/>
+    </outputs>
+    <help>
+      **Summarization of annotation from CLUSTER_TABLE**
+
+      The input file CLUSTER_TABLE.csv must contain a filled-in "Final_annotation" column. Contamination and organelle clusters are excluded from quantification. The cluster table must also contain the header lines - see example.
+
+      Example of CLUSTER_TABLE.csv: ::
+
+          Number_of_reads_in_clusters	1185180
+          Number_of_clusters	62148
+          Number_of_superclusters	62031
+          Number_of_singlets	314820
+          Number_of_analyzed_reads	1500000
+          Cluster	Supercluster	Size	Size_adjusted	Automatic_annotation	TAREAN_annotation	Final_annotation
+          10	4	11967	11967	All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE	Other	All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE
+          137	5	2094	2094	All/repeat	Other	All/repeat
+          112	9	3117	3117	All/repeat/rDNA/45S_rDNA	Other	All/repeat/rDNA/45S_rDNA
+          16	11	10078	10078	All/repeat/satellite	Putative satellites (high confidence)	All/repeat/satellite
+          125	22	2630	2630	All/organelle/plastid	Other	All/organelle/plastid
+          124	40	2645	2645	All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana	Putative LTR elements	All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana
+
+    </help>
+</tool>