Mercurial > repos > cpt > cpt_protein_blast_grouping
annotate protein_blast_grouping.py @ 1:f2a7dffab581 draft
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
author | cpt |
---|---|
date | Thu, 08 Aug 2024 02:46:54 +0000 |
parents | 7abe5f471364 |
children | c5e0e05ce58a |
rev | line source |
---|---|
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
1 import argparse |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
2 import re |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
3 import sys |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
4 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
5 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
6 class BlastProteinResultParser: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
7 def __init__(self, blast_file): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
8 self.blast_file = blast_file |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
9 self.results = {} |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
10 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
11 def parse_blast(self): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
12 for line in self.blast_file: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
13 parts = line.strip().split("\t") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
14 query_id = parts[0] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
15 subject_titles = parts[2].split("<>") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
16 for title in subject_titles: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
17 organism = self.extract_organism(title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
18 if organism: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
19 if organism not in self.results: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
20 self.results[organism] = { |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
21 "unique_queries": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
22 "unique_hits": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
23 } |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
24 # "unique query" == query proteins had a LEAST one match in organism |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
25 self.results[organism]["unique_queries"].add(query_id) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
26 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
27 self.results[organism]["unique_hits"].add(parts[1]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
28 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
29 @staticmethod |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
30 def extract_organism(title): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
31 match = re.search(r"\[(.*?)\]", title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
32 return match.group(1) if match else None |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
33 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
34 def get_top_hits(self, num_hits, key="unique_queries"): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
35 def sort_key(item): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
36 return len(item[1][key]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
37 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
38 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
39 return sorted_results[:num_hits] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
40 |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
41 def print_results( |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
42 self, num_hits, sort_key="unique_queries", output_file=sys.stdout |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
43 ): |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
44 top_hits = self.get_top_hits(num_hits, sort_key) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
45 print(f"# Top {num_hits} Hits") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
46 print( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
47 "{:<50} {:<25} {:<25}".format( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
48 "# Name", "Unique Query Matches", "Unique Subject Hits" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
49 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
50 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
51 for organism, data in top_hits: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
52 print( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
53 "{:<50} {:<25} {:<25}".format( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
54 organism, len(data["unique_queries"]), len(data["unique_hits"]) |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
55 ), |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
56 file=output_file, |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
57 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
58 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
59 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
60 def main(): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
61 parser = argparse.ArgumentParser( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
62 description="Parse BLAST results and group by 'top hits' to an organism" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
63 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
64 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
65 parser.add_argument( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
66 "--hits", type=int, default=5, help="Number of top hits to display" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
67 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
68 parser.add_argument( |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
69 "--output", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
70 type=argparse.FileType("w"), |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
71 default="-", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
72 help="Output file (default: stdout)", |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
73 ) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
74 parser.add_argument( |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
75 "--sort", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
76 choices=["unique_queries", "unique_hits"], |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
77 default="unique_queries", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
78 help="Sort results by 'unique_queries' (default) or 'unique_hits'", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
79 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
80 args = parser.parse_args() |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
81 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
82 blast_parser = BlastProteinResultParser(args.blast) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
83 blast_parser.parse_blast() |
1
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
84 blast_parser.print_results(args.hits, args.sort, args.output) |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
85 |
f2a7dffab581
planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents:
0
diff
changeset
|
86 args.output.close() |
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
87 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
88 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
89 if __name__ == "__main__": |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
90 main() |