annotate protein_blast_grouping.py @ 1:f2a7dffab581 draft

planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
author cpt
date Thu, 08 Aug 2024 02:46:54 +0000
parents 7abe5f471364
children c5e0e05ce58a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
1 import argparse
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
2 import re
1
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
3 import sys
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
4
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
5
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
6 class BlastProteinResultParser:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
7 def __init__(self, blast_file):
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
8 self.blast_file = blast_file
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
9 self.results = {}
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
10
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
11 def parse_blast(self):
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
12 for line in self.blast_file:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
13 parts = line.strip().split("\t")
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
14 query_id = parts[0]
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
15 subject_titles = parts[2].split("<>")
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
16 for title in subject_titles:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
17 organism = self.extract_organism(title)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
18 if organism:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
19 if organism not in self.results:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
20 self.results[organism] = {
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
21 "unique_queries": set(),
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
22 "unique_hits": set(),
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
23 }
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
24 # "unique query" == query proteins had a LEAST one match in organism
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
25 self.results[organism]["unique_queries"].add(query_id)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
26 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
27 self.results[organism]["unique_hits"].add(parts[1])
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
28
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
29 @staticmethod
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
30 def extract_organism(title):
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
31 match = re.search(r"\[(.*?)\]", title)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
32 return match.group(1) if match else None
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
33
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
34 def get_top_hits(self, num_hits, key="unique_queries"):
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
35 def sort_key(item):
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
36 return len(item[1][key])
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
37
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
38 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
39 return sorted_results[:num_hits]
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
40
1
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
41 def print_results(
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
42 self, num_hits, sort_key="unique_queries", output_file=sys.stdout
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
43 ):
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
44 top_hits = self.get_top_hits(num_hits, sort_key)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
45 print(f"# Top {num_hits} Hits")
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
46 print(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
47 "{:<50} {:<25} {:<25}".format(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
48 "# Name", "Unique Query Matches", "Unique Subject Hits"
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
49 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
50 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
51 for organism, data in top_hits:
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
52 print(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
53 "{:<50} {:<25} {:<25}".format(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
54 organism, len(data["unique_queries"]), len(data["unique_hits"])
1
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
55 ),
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
56 file=output_file,
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
57 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
58
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
59
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
60 def main():
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
61 parser = argparse.ArgumentParser(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
62 description="Parse BLAST results and group by 'top hits' to an organism"
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
63 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
64 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results")
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
65 parser.add_argument(
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
66 "--hits", type=int, default=5, help="Number of top hits to display"
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
67 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
68 parser.add_argument(
1
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
69 "--output",
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
70 type=argparse.FileType("w"),
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
71 default="-",
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
72 help="Output file (default: stdout)",
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
73 )
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
74 parser.add_argument(
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
75 "--sort",
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
76 choices=["unique_queries", "unique_hits"],
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
77 default="unique_queries",
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
78 help="Sort results by 'unique_queries' (default) or 'unique_hits'",
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
79 )
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
80 args = parser.parse_args()
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
81
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
82 blast_parser = BlastProteinResultParser(args.blast)
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
83 blast_parser.parse_blast()
1
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
84 blast_parser.print_results(args.hits, args.sort, args.output)
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
85
f2a7dffab581 planemo upload commit 6dde4ec93f27f36a10017393063dcf0568f1d405
cpt
parents: 0
diff changeset
86 args.output.close()
0
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
87
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
88
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
89 if __name__ == "__main__":
7abe5f471364 planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff changeset
90 main()