re_utils: cluster_table2krona_format.py comparison

comparison cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft

Uploaded

author	petr-novak
date	Tue, 18 May 2021 11:03:57 +0000
parents	d14b68e9fd1d
children

comparison

equal deleted inserted replaced

-:d7f3eff34c27
+:2f1b5d5c5dd5
 #!/usr/bin/env python
 import sys
 import re
 from collections import defaultdict
 import argparse
import csv

# Command-line interface: the cluster table to read, the file to write and
# whether the manually curated annotation column should be used.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", type=argparse.FileType('r'), required=True,
                    help="path to file CLUSTER_table.csv")
parser.add_argument("-o", "--output", type=argparse.FileType('w'), required=True,
                    help="output file name")
parser.add_argument("-m", "--use_manual", action='store_true', default=False)
args = parser.parse_args()

# Positional fallback for the annotation column; it is replaced by a lookup
# of the column name as soon as the header row of the table is found.
column = 6 if args.use_manual else 4
annotation = "Final_annotation" if args.use_manual else "Automatic_annotation"

header = False             # True once the "Cluster<TAB>Supercluster..." row was seen
clust_info = {}            # key -> int value, from the two-column rows above the header
counts = defaultdict(int)  # classification path -> summed value of the fourth column
top_clusters = 0           # sum of the fourth column over all rows below the header

# argparse.FileType('r') has already opened the input, so read from that
# handle instead of reopening the file by name: this avoids a second handle
# that is never closed and also works when the input is "-" (stdin).
# NOTE(review): the output is likewise opened (and truncated) by
# FileType('w') at parse time; the code after this block reopens it by name.
with args.input as f:
    csv_reader = csv.reader(f, delimiter="\t")
    for parts in csv_reader:
        if not parts:
            # Blank line.
            continue
        # Slice comparison instead of parts[1] so that a row holding only
        # "Cluster" cannot raise IndexError.
        if parts[:2] == ["Cluster", "Supercluster"]:
            header = True
            header_columns = parts
            # Raises ValueError when the requested annotation column is absent.
            column = header_columns.index(annotation)
            continue
        if header:
            # Rows below the header: drop the first "/"-separated element of
            # the annotation, join the remaining elements with tabs and strip
            # any double quotes.
            lineage = "\t".join(parts[column].split("/")[1:]).replace('"', '')
            classification = "Top_clusters\t" + lineage
            # parts[3] is the fourth column (presumably Size_adjusted - confirm
            # against the table layout).
            size = int(parts[3])
            counts[classification] += size
            top_clusters += size
        elif len(parts) >= 2:
            # Rows above the header: "key<TAB>integer" summary entries; rows
            # whose second field is not an integer are ignored.
            try:
                clust_info[parts[0].replace('"', '')] = int(parts[1])
            except ValueError:
                pass

# Both keys must be present in the summary rows; a missing key raises KeyError.
counts['Singlets'] = clust_info['Number_of_singlets']
counts['Small_cluster'] = clust_info['Number_of_reads_in_clusters'] - top_clusters
 with open(args.output.name, 'w') as fout:

Mercurial > repos > petr-novak > re_utils

comparison cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft