Repository 're_utils'
hg clone https://toolshed.g2.bx.psu.edu/repos/petr-novak/re_utils

Changeset 18:d7f3eff34c27 (2021-05-14)
Previous changeset 17:d14b68e9fd1d (2021-04-28) Next changeset 19:2f1b5d5c5dd5 (2021-05-18)
Commit message:
Uploaded
added:
summarize_cluster_table.R
summarize_cluster_table.xml
b
diff -r d14b68e9fd1d -r d7f3eff34c27 summarize_cluster_table.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_cluster_table.R Fri May 14 11:08:46 2021 +0000
[
@@ -0,0 +1,57 @@
+#!/usr/bin/env Rscript
+# Summarize a RepeatExplorer2 CLUSTER_TABLE.csv: number of reads and percentage
+# of analyzed reads per Final_annotation group; contamination and organelle
+# clusters are excluded from both the groups and the read total.
+library(optparse)
+option_list <- list(
+  make_option(c("-c", "--cluster_table"), default = NA, type = "character",
+              help = "file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"),
+  # Accepted for command-line compatibility; not used by this script.
+  make_option(c("-m", "--comparative_counts"), default = NA, type = "character",
+              help = "file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"),
+  make_option(c("-o", "--output"), type = "character",
+              help = "output file name")
+)
+opt <- parse_args(OptionParser(option_list = option_list))
+if (is.null(opt$output) || is.na(opt$cluster_table)) {
+  stop("both --cluster_table and --output must be given", call. = FALSE)
+}
+
+# The table proper starts at the column-header line; the lines before it are
+# a "key<TAB>value" preamble holding the total number of analyzed reads.
+cluster_annotation <- opt$cluster_table
+file_lines <- readLines(cluster_annotation)
+header_line <- grep(".*Cluster.*Supercluster.*Size", file_lines)[1]
+if (is.na(header_line)) {
+  stop("no Cluster/Supercluster/Size header line found in input", call. = FALSE)
+}
+annot <- read.table(cluster_annotation, sep = "\t", header = TRUE, as.is = TRUE,
+                    skip = header_line - 1)
+reads_line <- grep("Number_of_analyzed_reads", file_lines[seq_len(header_line)],
+                   value = TRUE)
+if (length(reads_line) == 0) {
+  stop("Number_of_analyzed_reads line not found in table preamble", call. = FALSE)
+}
+input_read_counts <- as.numeric(strsplit(reads_line, split = "\t")[[1]][2])
+
+## complete classification table:
+unique_groups <- sort(unique(annot$Final_annotation))
+groups_to_remove <- grep("contamination|organelle", unique_groups, value = TRUE)
+groups_to_keep <- setdiff(unique_groups, groups_to_remove)
+
+# Reads of discarded groups are subtracted from the denominator (0 if none).
+removed_reads <- sum(annot$Size_adjusted[annot$Final_annotation %in% groups_to_remove])
+input_count_reads_corrected <- input_read_counts - removed_reads
+
+# Per-group read totals; vapply yields one value per group, in group order.
+sum_of_reads <- vapply(groups_to_keep, function(g) {
+  sum(annot$Size_adjusted[annot$Final_annotation %in% g])
+}, numeric(1), USE.NAMES = FALSE)
+proportion <- sum_of_reads / input_count_reads_corrected
+
+summary_table <- data.frame(Annotation = groups_to_keep,
+                            Number_of_reads = sum_of_reads,
+                            "Proportion[%]" = proportion * 100, check.names = FALSE)
+print(opt$output)
+write.table(summary_table, file = opt$output,
+            row.names = FALSE, col.names = TRUE, sep = "\t")
b
diff -r d14b68e9fd1d -r d7f3eff34c27 summarize_cluster_table.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_cluster_table.xml Fri May 14 11:08:46 2021 +0000
b
@@ -0,0 +1,42 @@
+<tool id="summarize_annotation" name="Make summary of CLUSTER_TABLE" version="1.0.0">
+  <description> Simple utility to summarize final annotation from RepeatExplorer CLUSTER_TABLE</description>
+  <requirements>
+    <requirement type="package">r-optparse</requirement>
+  </requirements>
+
+    <!-- Rscript is called explicitly; paths are quoted so names with spaces work. -->
+    <command detect_errors="exit_code"><![CDATA[
+    Rscript '$__tool_directory__/summarize_cluster_table.R'
+    --cluster_table='$cluster_table'
+    --output='$output'
+    ]]></command>
+
+    <inputs>
+      <param format="txt" type="data" name="cluster_table" label="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv" help="CLUSTER_TABLE.csv must contain a completed Final_annotation column" />
+    </inputs>
+
+    <outputs>
+          <data format="tabular" name="output" label="Summary of repeat proportion from ${cluster_table.hid}"/>
+    </outputs>
+    <help>
+      **Summarization of annotation from CLUSTER_TABLE**
+
+      Input file CLUSTER_TABLE.csv must be tab-separated and must contain a filled "Final_annotation" column. Contamination and organelle clusters are discarded from quantification. The cluster table must also contain the header lines - see example.
+
+      Example of CLUSTER_TABLE.csv: ::
+
+          Number_of_reads_in_clusters 1185180
+          Number_of_clusters 62148
+          Number_of_superclusters 62031
+          Number_of_singlets 314820
+          Number_of_analyzed_reads 1500000
+          Cluster Supercluster Size Size_adjusted Automatic_annotation TAREAN_annotation Final_annotation
+          10 4 11967 11967 All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE Other All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE
+          137 5 2094 2094 All/repeat Other All/repeat
+          112 9 3117 3117 All/repeat/rDNA/45S_rDNA Other All/repeat/rDNA/45S_rDNA
+          16 11 10078 10078 All/repeat/satellite Putative satellites (high confidence) All/repeat/satellite
+          125 22 2630 2630 All/organelle/plastid Other All/organelle/plastid
+          124 40 2645 2645 All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana Putative LTR elements All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana
+
+    </help>
+</tool>