Mercurial > repos > petr-novak > re_utils
changeset 18:d7f3eff34c27 draft
Uploaded
author | petr-novak |
---|---|
date | Fri, 14 May 2021 11:08:46 +0000 |
parents | d14b68e9fd1d |
children | 2f1b5d5c5dd5 |
files | summarize_cluster_table.R summarize_cluster_table.xml |
diffstat | 2 files changed, 99 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env Rscript
## Summarize the Final_annotation column of a RepeatExplorer2
## CLUSTER_TABLE.csv file.
##
## The reads of every annotation group are summed (column Size_adjusted) and
## reported together with their proportion of the analyzed reads. Groups whose
## annotation matches "contamination" or "organelle" are left out of the
## summary and their reads are subtracted from the number of analyzed reads
## before the proportions are computed.

## Read a CLUSTER_TABLE.csv file.
##
## path: path to the tab-separated cluster table; the cluster rows must be
##       preceded by a preamble containing a "Number_of_analyzed_reads" line.
## Returns a list with
##   annot             - data.frame with the cluster rows,
##   input_read_counts - numeric value taken from the
##                       "Number_of_analyzed_reads" preamble line.
## Stops with an explicit message when the header line, the read count or a
## required column is missing.
read_cluster_table <- function(path) {
  all_lines <- readLines(path)
  header_line <- grep(".*Cluster.*Supercluster.*Size", all_lines)
  if (length(header_line) == 0L) {
    stop("no 'Cluster ... Supercluster ... Size' header line found in ",
         path, call. = FALSE)
  }
  ## the table starts at the first matching line
  header_line <- header_line[1L]

  annot <- read.table(path, sep = "\t", header = TRUE, as.is = TRUE,
                      skip = header_line - 1L)
  missing_columns <- setdiff(c("Size_adjusted", "Final_annotation"),
                             colnames(annot))
  if (length(missing_columns) > 0L) {
    stop("column(s) missing in ", path, ": ",
         paste(missing_columns, collapse = ", "), call. = FALSE)
  }

  ## the number of analyzed reads is stored in the preamble as "name<TAB>value"
  count_line <- grep("Number_of_analyzed_reads",
                     all_lines[seq_len(header_line)], value = TRUE)
  if (length(count_line) == 0L) {
    stop("no 'Number_of_analyzed_reads' line found in ", path, call. = FALSE)
  }
  input_read_counts <- suppressWarnings(
    as.numeric(strsplit(count_line[1L], split = "\t")[[1L]][2L])
  )
  if (is.na(input_read_counts)) {
    stop("cannot parse 'Number_of_analyzed_reads' value in ", path,
         call. = FALSE)
  }

  list(annot = annot, input_read_counts = input_read_counts)
}

## Build the summary table.
##
## annot:             data.frame with columns Size_adjusted and
##                    Final_annotation.
## input_read_counts: number of analyzed reads.
## Returns a data.frame with columns Annotation, Number_of_reads and
## "Proportion[%]", one row per kept annotation group (sorted by name).
summarize_annotation <- function(annot, input_read_counts) {
  unique_groups <- sort(unique(annot$Final_annotation))
  groups_to_remove <- grep("contamination|organelle", unique_groups,
                           value = TRUE)
  groups_to_keep <- setdiff(unique_groups, groups_to_remove)

  ## reads of discarded groups do not count towards the total; the sum is 0
  ## when there is nothing to remove
  removed_reads <- sum(
    annot$Size_adjusted[annot$Final_annotation %in% groups_to_remove]
  )
  input_count_reads_corrected <- input_read_counts - removed_reads
  if (input_count_reads_corrected <= 0) {
    warning("corrected number of analyzed reads is not positive, ",
            "proportions are not meaningful", call. = FALSE)
  }

  sum_of_reads <- vapply(
    groups_to_keep,
    function(g) as.numeric(sum(annot$Size_adjusted[annot$Final_annotation %in% g])),
    numeric(1)
  )
  proportion <- sum_of_reads / input_count_reads_corrected

  data.frame(Annotation = groups_to_keep,
             Number_of_reads = unname(sum_of_reads),
             "Proportion[%]" = unname(proportion) * 100,
             check.names = FALSE, stringsAsFactors = FALSE)
}

## Command line entry point: parse options, summarize, write the result.
main <- function() {
  library(optparse)
  option_list <- list(
    make_option(c("-c", "--cluster_table"), default = NA, type = "character",
                help = "file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"),
    ## NOTE(review): this option is accepted but not used by the script
    make_option(c("-m", "--comparative_counts"), default = NA,
                type = "character",
                help = "file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"),
    make_option(c("-o", "--output"), type = "character",
                help = "output file name")
  )
  opt <- parse_args(OptionParser(option_list = option_list))

  if (is.null(opt$cluster_table) || is.na(opt$cluster_table)) {
    stop("option --cluster_table is required", call. = FALSE)
  }
  if (is.null(opt$output) || is.na(opt$output)) {
    stop("option --output is required", call. = FALSE)
  }

  cluster_table <- read_cluster_table(opt$cluster_table)
  summary_table <- summarize_annotation(cluster_table$annot,
                                        cluster_table$input_read_counts)

  print(opt$output)
  write.table(summary_table, file = opt$output,
              row.names = FALSE, col.names = TRUE,
              sep = "\t")
}

## run only when executed as a script (e.g. via Rscript), not when sourced
if (sys.nframe() == 0L) {
  main()
}
<!-- Galaxy wrapper for summarize_cluster_table.R: takes a RepeatExplorer2
     CLUSTER_TABLE.csv with a completed Final_annotation column and produces a
     tabular summary of read counts and proportions per annotation. -->
<tool id="summarize_annotation" name="Make summary of CLUSTER_TABLE" version="1.0.0">
  <description>Simple utility to summarize final annotation from RepeatExplorer CLUSTER_TABLE</description>
  <requirements>
    <requirement type="package">r-optparse</requirement>
  </requirements>

  <!-- Paths are single-quoted so that unusual characters in dataset paths
       cannot break the command line. -->
  <command detect_errors="exit_code"><![CDATA[
    Rscript '$__tool_directory__/summarize_cluster_table.R'
      --cluster_table='$cluster_table'
      --output='$output'
  ]]></command>

  <inputs>
    <param format="txt" type="data" name="cluster_table" label="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv" help="CLUSTER_TABLE.csv must contain completed Final_annotation column" />
  </inputs>

  <outputs>
    <data format="tabular" name="output" label="Summary of repeat proportion from ${cluster_table.hid}"/>
  </outputs>
  <help><![CDATA[
**Summarization of annotation from CLUSTER_TABLE**

Input file CLUSTER_TABLE.csv must contain filled "Final_annotation" column. Contamination and organelle clusters are discarded from quantification, cluster table must also contain header - see example. Columns are tab-separated.

Example of CLUSTER_TABLE.csv: ::

    Number_of_reads_in_clusters 1185180
    Number_of_clusters 62148
    Number_of_superclusters 62031
    Number_of_singlets 314820
    Number_of_analyzed_reads 1500000
    Cluster Supercluster Size Size_adjusted Automatic_annotation TAREAN_annotation Final_annotation
    10 4 11967 11967 All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE Other All/repeat/mobile_element/Class_I/LTR/Ty1_copia/SIRE
    137 5 2094 2094 All/repeat Other All/repeat
    112 9 3117 3117 All/repeat/rDNA/45S_rDNA Other All/repeat/rDNA/45S_rDNA
    16 11 10078 10078 All/repeat/satellite Putative satellites (high confidence) All/repeat/satellite
    125 22 2630 2630 All/organelle/plastid Other All/organelle/plastid
    124 40 2645 2645 All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana Putative LTR elements All/repeat/mobile_element/Class_I/LTR/Ty1_copia/Ivana

  ]]></help>
</tool>