re_utils: cluster_table2krona_format.py comparison

Uploaded - new tools added

comparison

equal deleted inserted replaced

-:5376e1c9adec
+:d14b68e9fd1d
#!/usr/bin/env python
"""Convert a CLUSTER_table.csv file into tab-separated Krona-style text.

The input is read in two phases:

* Before the column-header line, every line with at least two
  whitespace-separated fields is stored as a ``name -> integer`` summary
  entry (double quotes are stripped from the name).
* After the column-header line, every non-blank line is treated as one
  cluster row.  Its annotation field is split on ``/``, the first path
  element is dropped, and the row's size field is added to the resulting
  classification under the ``Top_clusters`` root.

Two extra categories are appended: ``Singlets`` (summary entry
``Number_of_singlets``) and ``Small_cluster`` (summary entry
``Number_of_reads_in_clusters`` minus all reads counted in cluster rows).

Each output line is ``<read count>\\t<classification>``, where the
classification levels are themselves tab-separated.
"""
import sys
import re
from collections import defaultdict
import argparse

# Column-header line that separates the summary preamble from the cluster rows.
HEADER_PATTERN = re.compile(
    '.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation'
    '.+TAREAN_annotation.+Final_annotation')

# Zero-based indices of whitespace-separated fields in a cluster row.  Going
# by the header's column order these are Size_adjusted, Automatic_annotation
# and Final_annotation respectively.
SIZE_COLUMN = 3
AUTOMATIC_COLUMN = 4
MANUAL_COLUMN = 6


def _build_parser():
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser()
    # Both files are mandatory: without them there is nothing to read or write,
    # so fail with a usage message rather than an AttributeError later on.
    parser.add_argument("-i", "--input", type=argparse.FileType('r'),
                        required=True,
                        help="path to file CLUSTER_table.csv")
    parser.add_argument("-o", "--output", type=argparse.FileType('w'),
                        required=True,
                        help="output file name")
    parser.add_argument("-m", "--use_manual", action='store_true',
                        default=False,
                        help="classify by field 6 (Final_annotation) instead "
                             "of field 4 (Automatic_annotation)")
    return parser


def _tally_reads(lines, column):
    """Sum read counts per classification from the lines of a cluster table.

    Args:
        lines: iterable of text lines (e.g. an open file).
        column: zero-based index of the annotation field to classify by.

    Returns:
        A dict-like mapping of classification string to read count, in
        first-seen order, with ``Singlets`` and ``Small_cluster`` added last.

    Raises:
        KeyError: if the preamble lacks ``Number_of_singlets`` or
            ``Number_of_reads_in_clusters``.
        ValueError: if a count field is not an integer.
        IndexError: if a cluster row has too few fields.
    """
    summary = {}
    counts = defaultdict(int)
    top_clusters = 0
    in_table = False
    for line in lines:
        if HEADER_PATTERN.match(line):
            print("header detected")
            in_table = True
            continue
        fields = line.split()
        if not fields:
            # Blank lines carry no data; indexing into them would fail.
            continue
        if in_table:
            # Drop the first path element and use the rest as Krona levels.
            lineage = "\t".join(fields[column].split("/")[1:]).replace('"', '')
            size = int(fields[SIZE_COLUMN])
            counts["Top_clusters\t" + lineage] += size
            top_clusters += size
        elif len(fields) >= 2:
            summary[fields[0].replace('"', '')] = int(fields[1])
    counts['Singlets'] = summary['Number_of_singlets']
    # Reads that are in clusters but not in any listed cluster row.
    counts['Small_cluster'] = (
        summary['Number_of_reads_in_clusters'] - top_clusters)
    return counts


def main(argv=None):
    """Run the conversion.

    Args:
        argv: list of command-line arguments; ``None`` means ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)
    column = MANUAL_COLUMN if args.use_manual else AUTOMATIC_COLUMN
    # argparse.FileType has already opened both files; use those handles
    # directly (instead of reopening by name) and make sure they get closed.
    with args.input as table, args.output as report:
        counts = _tally_reads(table, column)
        for classification, nreads in counts.items():
            report.write(str(nreads) + "\t" + classification + "\n")


if __name__ == "__main__":
    main()

Mercurial > repos > petr-novak > re_utils