# HG changeset patch # User petr-novak # Date 1621335837 0 # Node ID 2f1b5d5c5dd5f3f0918804fb4dc7bce694a2d7aa # Parent d7f3eff34c275a57f98bcdde6811898f431f3f26 Uploaded diff -r d7f3eff34c27 -r 2f1b5d5c5dd5 cluster_table2krona_format.py --- a/cluster_table2krona_format.py Fri May 14 11:08:46 2021 +0000 +++ b/cluster_table2krona_format.py Tue May 18 11:03:57 2021 +0000 @@ -3,6 +3,7 @@ import re from collections import defaultdict import argparse +import csv parser = argparse.ArgumentParser() parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv") @@ -12,26 +13,35 @@ args = parser.parse_args() column = 6 if args.use_manual else 4 - +if args.use_manual: + annotation="Final_annotation" +else: + annotation="Automatic_annotation" header = False clust_info = {} counts = defaultdict(lambda: 0) top_clusters = 0 with open(args.input.name, 'r') as f: - for l in f: - parts = l.split() - if re.match('.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation.+TAREAN_annotation.+Final_annotation', l): - print("header detected") + csv_reader = csv.reader(f, delimiter = "\t") + for parts in csv_reader: + if len(parts) == 0: + continue + if parts[0] == "Cluster" and parts[1]== "Supercluster": header = True + header_columns = parts + column = header_columns.index(annotation) continue if header: classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','') counts[classification] += int(parts[3]) top_clusters += int(parts[3]) + elif len(parts) >= 2: + try: + clust_info[parts[0].replace('"', '')] = int(parts[1]) + except ValueError: + pass - elif len(parts) >= 2: - clust_info[parts[0].replace('"', '')] = int(parts[1]) counts['Singlets'] = clust_info['Number_of_singlets'] counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters diff -r d7f3eff34c27 -r 2f1b5d5c5dd5 summarize_cluster_table.xml --- a/summarize_cluster_table.xml Fri May 14 11:08:46 2021 +0000 +++ 
b/summarize_cluster_table.xml Tue May 18 11:03:57 2021 +0000 @@ -1,5 +1,5 @@ - - Simple utility to summarize final annotation from RepeatExplorer CLUSTER_TABLE + + Simple utility to summarize final annotations from RepeatExplorer CLUSTER_TABLE r-optparse @@ -16,12 +16,12 @@ - + - **Summarization of annotation from CLUSTER_TABLE** + **The tool calculates genome proportions of identified repeats based on cluster annotations in CLUSTER_TABLE.csv** - Input file CLUSTER_TABLE.csv must contains filled "Final_annotation" column. Contamination and organelle clusters are discarded from quantification, cluster table must also contain header - see example. + The column "Final_annotation" must be filled in the input file CLUSTER_TABLE.csv. Contamination and organelle clusters are discarded from quantification. The table header with information about numbers of analyzed reads should remain unchanged - see example below. Example of CLUSTER_TABLE.csv: ::