annotate cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft

Uploaded
author petr-novak
date Tue, 18 May 2021 11:03:57 +0000
parents d14b68e9fd1d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
17
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
2 import sys
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
3 import re
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
4 from collections import defaultdict
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
5 import argparse
19
2f1b5d5c5dd5 Uploaded
petr-novak
parents: 17
diff changeset
6 import csv
17
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
7
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
# Convert a tab-separated cluster table into "<count>\t<classification>" lines.
#
# The input (described by the -i help text as CLUSTER_table.csv) is read in
# two phases:
#   * before the row starting with "Cluster", "Supercluster": every row with
#     at least two fields whose second field is an integer is stored as a
#     summary entry (name -> integer);
#   * after that header row: every row is a cluster whose annotation path
#     (e.g. "All/a/b") and read count are accumulated per classification.
# NOTE(review): the output is presumably meant for Krona (per the script
# name) -- confirm against the consumer of the output file.

# Index of the column holding the per-cluster read count in the rows that
# follow the header line (hard-coded in the original script).
_SIZE_COLUMN = 3

# Summary entries that must appear before the header line.
_SINGLETS_KEY = 'Number_of_singlets'
_READS_IN_CLUSTERS_KEY = 'Number_of_reads_in_clusters'


def summarize_cluster_table(rows, annotation):
    """Aggregate read counts per classification from parsed table rows.

    Args:
        rows: iterable of rows, each a list of string fields (for example a
            ``csv.reader`` over the tab-separated input).
        annotation: name of the header column whose value is used as the
            classification path ("Automatic_annotation" or
            "Final_annotation" when called from ``main``).

    Returns:
        A dict mapping classification strings to read counts, in insertion
        order: one "Top_clusters\\t..." entry per distinct annotation,
        followed by "Singlets" and "Small_cluster".

    Raises:
        ValueError: if the header row lacks the ``annotation`` column, or a
            read count is not an integer.
        KeyError: if a required summary entry is missing from the rows that
            precede the header.
        IndexError: if a cluster row is shorter than the columns it must
            provide.
    """
    summary = {}
    counts = defaultdict(int)
    top_clusters = 0
    column = None  # stays None until the header row has been seen

    for parts in rows:
        if not parts:
            continue

        if len(parts) >= 2 and parts[0] == "Cluster" and parts[1] == "Supercluster":
            try:
                column = parts.index(annotation)
            except ValueError:
                raise ValueError(
                    "column '%s' not found in table header" % annotation
                ) from None
            continue

        if column is not None:
            # Drop the first path element and make the remaining levels
            # tab-separated below a common "Top_clusters" root.
            levels = parts[column].split("/")[1:]
            classification = "Top_clusters\t" + "\t".join(levels).replace('"', '')
            size = int(parts[_SIZE_COLUMN])
            counts[classification] += size
            top_clusters += size
        elif len(parts) >= 2:
            # Summary section: keep only "name<TAB>integer" rows; anything
            # else in this section is deliberately ignored.
            try:
                summary[parts[0].replace('"', '')] = int(parts[1])
            except ValueError:
                pass

    missing = [
        key for key in (_SINGLETS_KEY, _READS_IN_CLUSTERS_KEY)
        if key not in summary
    ]
    if missing:
        raise KeyError(
            "missing summary value(s) in input table: " + ", ".join(missing)
        )

    counts['Singlets'] = summary[_SINGLETS_KEY]
    # Reads that are in clusters but not in any of the listed clusters.
    counts['Small_cluster'] = summary[_READS_IN_CLUSTERS_KEY] - top_clusters
    return dict(counts)


def main(argv=None):
    """Parse command-line arguments and write the converted table.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=argparse.FileType('r'),
                        required=True, help="path to file CLUSTER_table.csv")
    parser.add_argument("-o", "--output", type=argparse.FileType('w'),
                        required=True, help="output file name")
    parser.add_argument("-m", "--use_manual", action='store_true', default=False)
    args = parser.parse_args(argv)

    annotation = "Final_annotation" if args.use_manual else "Automatic_annotation"

    # argparse has already opened both files; use those handles directly
    # (instead of reopening by name) and make sure they get closed.
    with args.input as fin, args.output as fout:
        counts = summarize_cluster_table(
            csv.reader(fin, delimiter="\t"), annotation
        )
        for cls_line, nreads in counts.items():
            fout.write(str(nreads) + "\t" + cls_line + "\n")


if __name__ == "__main__":
    main()
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
52
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
53
d14b68e9fd1d Uploaded - new tools added
petr-novak
parents:
diff changeset
54