Mercurial > repos > petr-novak > re_utils
diff cluster_table2krona_format.py @ 17:d14b68e9fd1d draft
Uploaded - new tools added
author | petr-novak |
---|---|
date | Wed, 28 Apr 2021 08:37:20 +0000 |
parents | |
children | 2f1b5d5c5dd5 |
line wrap: on
line diff
#!/usr/bin/env python
"""Convert a CLUSTER_table.csv file into tab-separated text for Krona.

The input is read in two phases:

* Before the column header row, every line with at least two
  whitespace-separated fields is stored as a ``name -> integer`` summary
  entry (surrounding double quotes are stripped from the name).
* After the header row, every non-blank line is treated as one cluster
  record; its size is accumulated under its annotation path.

Each output line is ``<read count>TAB<classification>``, where the
classification levels are themselves tab-separated.
"""
import sys
import re
from collections import defaultdict
import argparse

# Matches the column header row that separates the summary section from
# the per-cluster records.  Compiled once instead of on every input line.
HEADER_PATTERN = re.compile(
    '.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation'
    '.+TAREAN_annotation.+Final_annotation'
)

# Zero-based field positions in a cluster record, following the column
# order named in the header row.
SIZE_ADJUSTED_COLUMN = 3
AUTOMATIC_ANNOTATION_COLUMN = 4
FINAL_ANNOTATION_COLUMN = 6


def summarize_cluster_table(lines, column):
    """Aggregate read counts per annotation from cluster-table lines.

    Args:
        lines: iterable of text lines (an open file or a list of strings).
        column: zero-based index of the field holding the "/"-separated
            annotation path used for classification.

    Returns:
        dict mapping classification string to read count, in insertion
        order: one ``"Top_clusters\\t<level>\\t..."`` entry per distinct
        annotation, followed by ``"Singlets"`` and ``"Small_cluster"``.

    Raises:
        KeyError: if the summary section lacks ``Number_of_singlets`` or
            ``Number_of_reads_in_clusters``.
        IndexError: if a non-blank cluster record has too few fields.
        ValueError: if a count field is not an integer.
    """
    header_seen = False
    clust_info = {}
    counts = defaultdict(int)
    top_clusters = 0

    for line in lines:
        if HEADER_PATTERN.match(line):
            print("header detected")
            header_seen = True
            continue

        # NOTE(review): fields are split on any whitespace, so an
        # annotation value containing spaces, or an empty field, would
        # shift the column positions -- confirm against real input files.
        parts = line.split()

        if header_seen:
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no record;
                # indexing them would raise IndexError.
                continue
            # Drop the first path element and keep the remaining levels.
            levels = parts[column].split("/")[1:]
            classification = "Top_clusters\t" + "\t".join(levels).replace('"', '')
            size = int(parts[SIZE_ADJUSTED_COLUMN])
            counts[classification] += size
            top_clusters += size
        elif len(parts) >= 2:
            clust_info[parts[0].replace('"', '')] = int(parts[1])

    counts['Singlets'] = clust_info['Number_of_singlets']
    # Reads that are in clusters but not in any cluster listed in the table.
    counts['Small_cluster'] = (
        int(clust_info['Number_of_reads_in_clusters']) - top_clusters
    )
    return dict(counts)


def format_krona_lines(counts):
    """Yield one ``"<count>\\t<classification>\\n"`` line per entry."""
    for cls_line, nreads in counts.items():
        yield str(nreads) + "\t" + cls_line + "\n"


def main(argv=None):
    """Parse command-line arguments and run the conversion.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=argparse.FileType('r'),
                        required=True,
                        help="path to file CLUSTER_table.csv")
    parser.add_argument("-o", "--output", type=argparse.FileType('w'),
                        required=True,
                        help="output file name")
    parser.add_argument("-m", "--use_manual", action='store_true',
                        default=False,
                        help="classify by the Final_annotation column "
                             "instead of Automatic_annotation")
    args = parser.parse_args(argv)

    if args.use_manual:
        column = FINAL_ANNOTATION_COLUMN
    else:
        column = AUTOMATIC_ANNOTATION_COLUMN

    # argparse.FileType has already opened both files; use those handles
    # directly (and close them) rather than reopening by name, which
    # leaked the originals and could not handle "-" for stdin/stdout.
    with args.input as fin:
        counts = summarize_cluster_table(fin, column)

    with args.output as fout:
        fout.writelines(format_krona_lines(counts))


if __name__ == "__main__":
    main()