Mercurial > repos > cpt > cpt_protein_blast_grouping
annotate protein_blast_grouping.py @ 3:c5e0e05ce58a draft
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
author | cpt |
---|---|
date | Thu, 08 Aug 2024 03:33:45 +0000 |
parents | f2a7dffab581 |
children | 258afe02e34a |
rev | line source |
---|---|
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
1 import argparse |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
2 import re |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
3 import sys |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
4 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
5 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
6 class BlastProteinResultParser: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
7 def __init__(self, blast_file): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
8 self.blast_file = blast_file |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
9 self.results = {} |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
10 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
11 def parse_blast(self): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
12 for line in self.blast_file: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
13 parts = line.strip().split("\t") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
14 query_id = parts[0] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
15 subject_titles = parts[2].split("<>") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
16 for title in subject_titles: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
17 organism = self.extract_organism(title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
18 if organism: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
19 if organism not in self.results: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
20 self.results[organism] = { |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
21 "unique_queries": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
22 "unique_hits": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
23 } |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
24 # "unique query" == query proteins had a LEAST one match in organism |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
25 self.results[organism]["unique_queries"].add(query_id) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
26 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
27 self.results[organism]["unique_hits"].add(parts[1]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
28 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
29 @staticmethod |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
30 def extract_organism(title): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
31 match = re.search(r"\[(.*?)\]", title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
32 return match.group(1) if match else None |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
33 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
34 def get_top_hits(self, num_hits, key="unique_queries"): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
35 def sort_key(item): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
36 return len(item[1][key]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
37 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
38 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
39 return sorted_results[:num_hits] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
40 |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
41 def print_results( |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
42 self, num_hits, sort_key="unique_queries", output_file=sys.stdout |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
43 ): |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
44 top_hits = self.get_top_hits(num_hits, sort_key) |
3
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
45 print(f"# Top {num_hits} Hits", file=output_file) |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
46 print( |
3
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
47 "{:>4} {:<20} {:>10} {:>10}".format( |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
48 "Rank", "Phage Name", "Unique Query Matches", "Unique Subject Hits" |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
49 ), |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
50 file=output_file, |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
51 ) |
3
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
52 for rank, (organism, data) in enumerate(top_hits): |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
53 print( |
3
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
54 "{:>4} {:<50} {:<25} {:<25}".format( |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
55 rank, |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
56 organism, |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
57 len(data["unique_queries"]), |
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
58 len(data["unique_hits"]), |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
59 ), |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
60 file=output_file, |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
61 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
62 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
63 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
64 def main(): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
65 parser = argparse.ArgumentParser( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
66 description="Parse BLAST results and group by 'top hits' to an organism" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
67 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
68 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
69 parser.add_argument( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
70 "--hits", type=int, default=5, help="Number of top hits to display" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
71 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
72 parser.add_argument( |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
73 "--output", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
74 type=argparse.FileType("w"), |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
75 default="-", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
76 help="Output file (default: stdout)", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
77 ) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
78 parser.add_argument( |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
79 "--sort", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
80 choices=["unique_queries", "unique_hits"], |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
81 default="unique_queries", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
82 help="Sort results by 'unique_queries' (default) or 'unique_hits'", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
83 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
84 args = parser.parse_args() |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
85 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
86 blast_parser = BlastProteinResultParser(args.blast) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
87 blast_parser.parse_blast() |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
88 blast_parser.print_results(args.hits, args.sort, args.output) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
89 |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
90 args.output.close() |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
91 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
92 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
93 if __name__ == "__main__": |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
94 main() |