Mercurial > repos > petr-novak > re_utils
changeset 19:2f1b5d5c5dd5 draft
Uploaded
author | petr-novak |
---|---|
date | Tue, 18 May 2021 11:03:57 +0000 |
parents | d7f3eff34c27 |
children | 5a05925340b0 |
files | cluster_table2krona_format.py summarize_cluster_table.xml |
diffstat | 2 files changed, 22 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/cluster_table2krona_format.py Fri May 14 11:08:46 2021 +0000
+++ b/cluster_table2krona_format.py Tue May 18 11:03:57 2021 +0000
@@ -3,6 +3,7 @@
 import re
 from collections import defaultdict
 import argparse
+import csv
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv")
@@ -12,26 +13,35 @@
 args = parser.parse_args()
 
 column = 6 if args.use_manual else 4
-
+if args.use_manual:
+    annotation="Final_annotation"
+else:
+    annotation="Automatic_annotation"
 header = False
 clust_info = {}
 counts = defaultdict(lambda: 0)
 top_clusters = 0
 with open(args.input.name, 'r') as f:
-    for l in f:
-        parts = l.split()
-        if re.match('.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation.+TAREAN_annotation.+Final_annotation', l):
-            print("header detected")
+    csv_reader = csv.reader(f, delimiter = "\t")
+    for parts in csv_reader:
+        if len(parts) == 0:
+            continue
+        if parts[0] == "Cluster" and parts[1]== "Supercluster":
             header = True
+            header_columns = parts
+            column = header_columns.index(annotation)
             continue
         if header:
             classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','')
             counts[classification] += int(parts[3])
             top_clusters += int(parts[3])
+        elif len(parts) >= 2:
+            try:
+                clust_info[parts[0].replace('"', '')] = int(parts[1])
+            except ValueError:
+                pass
 
 
-        elif len(parts) >= 2:
-            clust_info[parts[0].replace('"', '')] = int(parts[1])
 
 counts['Singlets'] = clust_info['Number_of_singlets']
 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters
--- a/summarize_cluster_table.xml Fri May 14 11:08:46 2021 +0000
+++ b/summarize_cluster_table.xml Tue May 18 11:03:57 2021 +0000
@@ -1,5 +1,5 @@
-<tool id="summarize_annotation" name="Make summary of CLUSTER_TABLE" version="1.0.0">
-  <description> Simple utility to summarize final annotation from RepeatExplorer CLUSTER_TABLE</description>
+<tool id="summarize_annotation" name="Repeat proportions from CLUSTER_TABLE" version="1.0.0">
+  <description> Simple utility to summarize final annotations from RepeatExplorer CLUSTER_TABLE</description>
   <requirements>
     <requirement type="package">r-optparse</requirement>
   </requirements>
@@ -16,12 +16,12 @@
   </inputs>
 
   <outputs>
-    <data format="tabular" name="output" label="Summary of repeat proportion from ${cluster_table.hid}"/>
+    <data format="tabular" name="output" label="Summary of repeat proportions from ${cluster_table.hid}"/>
   </outputs>
   <help>
-    **Summarization of annotation from CLUSTER_TABLE**
+    **The tool calculates genome proportions of identified repeats based on cluster annotations in CLUSTER_TABLE.csv**
 
-    Input file CLUSTER_TABLE.csv must contains filled "Final_annotation" column. Contamination and organelle clusters are discarded from quantification, cluster table must also contain header - see example.
+    The column "Final_annotation" must be filled in the input file CLUSTER_TABLE.csv. Contamination and organelle clusters are discarded from quantification. Table header with information about numbers of analyzed reads should remain unchanged - see example below.
 
     Example of CLUSTER_TABLE.csv:
     ::