comparison cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft

Uploaded
author petr-novak
date Tue, 18 May 2021 11:03:57 +0000
parents d14b68e9fd1d
children
comparison
equal deleted inserted replaced
18:d7f3eff34c27 19:2f1b5d5c5dd5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import sys 2 import sys
3 import re 3 import re
4 from collections import defaultdict 4 from collections import defaultdict
5 import argparse 5 import argparse
6 import csv
6 7
7 parser = argparse.ArgumentParser() 8 parser = argparse.ArgumentParser()
8 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv") 9 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv")
9 parser.add_argument("-o" ,"--output", type=argparse.FileType('w'), help="output file name") 10 parser.add_argument("-o" ,"--output", type=argparse.FileType('w'), help="output file name")
10 parser.add_argument("-m", "--use_manual", action='store_true', default=False) 11 parser.add_argument("-m", "--use_manual", action='store_true', default=False)
11 12
12 args = parser.parse_args() 13 args = parser.parse_args()
13 14
14 column = 6 if args.use_manual else 4 15 column = 6 if args.use_manual else 4
15 16 if args.use_manual:
17 annotation="Final_annotation"
18 else:
19 annotation="Automatic_annotation"
16 20
17 header = False 21 header = False
18 clust_info = {} 22 clust_info = {}
19 counts = defaultdict(lambda: 0) 23 counts = defaultdict(lambda: 0)
20 top_clusters = 0 24 top_clusters = 0
21 with open(args.input.name, 'r') as f: 25 with open(args.input.name, 'r') as f:
22 for l in f: 26 csv_reader = csv.reader(f, delimiter = "\t")
23 parts = l.split() 27 for parts in csv_reader:
24 if re.match('.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation.+TAREAN_annotation.+Final_annotation', l): 28 if len(parts) == 0:
25 print("header detected") 29 continue
30 if parts[0] == "Cluster" and parts[1]== "Supercluster":
26 header = True 31 header = True
32 header_columns = parts
33 column = header_columns.index(annotation)
27 continue 34 continue
28 if header: 35 if header:
29 classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','') 36 classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','')
30 counts[classification] += int(parts[3]) 37 counts[classification] += int(parts[3])
31 top_clusters += int(parts[3]) 38 top_clusters += int(parts[3])
39 elif len(parts) >= 2:
40 try:
41 clust_info[parts[0].replace('"', '')] = int(parts[1])
42 except ValueError:
43 pass
32 44
33 elif len(parts) >= 2:
34 clust_info[parts[0].replace('"', '')] = int(parts[1])
35 45
36 counts['Singlets'] = clust_info['Number_of_singlets'] 46 counts['Singlets'] = clust_info['Number_of_singlets']
37 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters 47 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters
38 48
39 with open(args.output.name, 'w') as fout: 49 with open(args.output.name, 'w') as fout: