Mercurial > repos > cpt > cpt_protein_blast_grouping
annotate protein_blast_grouping.py @ 5:11701cc63084 draft default tip
planemo upload commit 12a58961f0eb41803c1954ac8c2543ed7c56186f
author | cpt |
---|---|
date | Thu, 08 Aug 2024 04:04:26 +0000 (5 months ago) |
parents | 258afe02e34a |
children |
rev | line source |
---|---|
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
1 import argparse |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
2 import re |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
3 import sys |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
4 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
5 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
6 class BlastProteinResultParser: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
7 def __init__(self, blast_file): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
8 self.blast_file = blast_file |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
9 self.results = {} |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
10 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
11 def parse_blast(self): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
12 for line in self.blast_file: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
13 parts = line.strip().split("\t") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
14 query_id = parts[0] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
15 subject_titles = parts[2].split("<>") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
16 for title in subject_titles: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
17 organism = self.extract_organism(title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
18 if organism: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
19 if organism not in self.results: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
20 self.results[organism] = { |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
21 "unique_queries": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
22 "unique_hits": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
23 } |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
24 # "unique query" == query proteins had a LEAST one match in organism |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
25 self.results[organism]["unique_queries"].add(query_id) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
26 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
27 self.results[organism]["unique_hits"].add(parts[1]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
28 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
29 @staticmethod |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
30 def extract_organism(title): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
31 match = re.search(r"\[(.*?)\]", title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
32 return match.group(1) if match else None |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
33 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
34 def get_top_hits(self, num_hits, key="unique_queries"): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
35 def sort_key(item): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
36 return len(item[1][key]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
37 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
38 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
39 return sorted_results[:num_hits] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
40 |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
41 def print_results( |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
42 self, num_hits, sort_key="unique_queries", output_file=sys.stdout |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
43 ): |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
44 top_hits = self.get_top_hits(num_hits, sort_key) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
45 print( |
4
258afe02e34a
planemo upload commit 1098c37fcd6cf35d77748a08aa8b3ee9093c5aea
cpt
parents:
3
diff
changeset
|
46 "Rank\tPhage Name\tUnique Query Matches\tUnique Subject Hits", |
3
c5e0e05ce58a
planemo upload commit 120ca2f78836796630f3116ec95ad0326c6e6b6f
cpt
parents:
1
diff
changeset
|
47 file=output_file, |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
48 ) |
5
11701cc63084
planemo upload commit 12a58961f0eb41803c1954ac8c2543ed7c56186f
cpt
parents:
4
diff
changeset
|
49 for r, (organism, data) in enumerate(top_hits): |
11701cc63084
planemo upload commit 12a58961f0eb41803c1954ac8c2543ed7c56186f
cpt
parents:
4
diff
changeset
|
50 rank = r + 1 # 1-based ranking |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
51 print( |
4
258afe02e34a
planemo upload commit 1098c37fcd6cf35d77748a08aa8b3ee9093c5aea
cpt
parents:
3
diff
changeset
|
52 f"{rank}\t{organism}\t{len(data['unique_queries'])}\t{len(data['unique_hits'])}", |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
53 file=output_file, |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
54 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
55 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
56 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
57 def main(): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
58 parser = argparse.ArgumentParser( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
59 description="Parse BLAST results and group by 'top hits' to an organism" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
60 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
61 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
62 parser.add_argument( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
63 "--hits", type=int, default=5, help="Number of top hits to display" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
64 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
65 parser.add_argument( |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
66 "--output", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
67 type=argparse.FileType("w"), |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
68 default="-", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
69 help="Output file (default: stdout)", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
70 ) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
71 parser.add_argument( |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
72 "--sort", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
73 choices=["unique_queries", "unique_hits"], |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
74 default="unique_queries", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
75 help="Sort results by 'unique_queries' (default) or 'unique_hits'", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
76 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
77 args = parser.parse_args() |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
78 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
79 blast_parser = BlastProteinResultParser(args.blast) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
80 blast_parser.parse_blast() |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
81 blast_parser.print_results(args.hits, args.sort, args.output) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
82 |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
83 args.output.close() |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
84 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
85 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
86 if __name__ == "__main__": |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
87 main() |