comparison protein_blast_grouping.py @ 0:7abe5f471364 draft

planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
author cpt
date Wed, 24 Jul 2024 01:37:37 +0000
parents
children f2a7dffab581
comparison
equal deleted inserted replaced
-1:000000000000 0:7abe5f471364
1 import argparse
2 import re
3
4
5 class BlastProteinResultParser:
6 def __init__(self, blast_file):
7 self.blast_file = blast_file
8 self.results = {}
9
10 def parse_blast(self):
11 for line in self.blast_file:
12 parts = line.strip().split("\t")
13 query_id = parts[0]
14 subject_titles = parts[2].split("<>")
15 for title in subject_titles:
16 organism = self.extract_organism(title)
17 if organism:
18 if organism not in self.results:
19 self.results[organism] = {
20 "unique_queries": set(),
21 "unique_hits": set(),
22 }
23 # "unique query" == query proteins had a LEAST one match in organism
24 self.results[organism]["unique_queries"].add(query_id)
25 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries
26 self.results[organism]["unique_hits"].add(parts[1])
27
28 @staticmethod
29 def extract_organism(title):
30 match = re.search(r"\[(.*?)\]", title)
31 return match.group(1) if match else None
32
33 def get_top_hits(self, num_hits, key="unique_queries"):
34 def sort_key(item):
35 return len(item[1][key])
36
37 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True)
38 return sorted_results[:num_hits]
39
40 def print_results(self, num_hits, sort_key="unique_queries"):
41 top_hits = self.get_top_hits(num_hits, sort_key)
42 print(f"# Top {num_hits} Hits")
43 print(
44 "{:<50} {:<25} {:<25}".format(
45 "# Name", "Unique Query Matches", "Unique Subject Hits"
46 )
47 )
48 for organism, data in top_hits:
49 print(
50 "{:<50} {:<25} {:<25}".format(
51 organism, len(data["unique_queries"]), len(data["unique_hits"])
52 )
53 )
54
55
56 def main():
57 parser = argparse.ArgumentParser(
58 description="Parse BLAST results and group by 'top hits' to an organism"
59 )
60 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results")
61 parser.add_argument(
62 "--hits", type=int, default=5, help="Number of top hits to display"
63 )
64 parser.add_argument(
65 "--sort",
66 choices=["unique_queries", "unique_hits"],
67 default="unique_queries",
68 help="Sort results by 'unique_queries' (default) or 'unique_hits'",
69 )
70 args = parser.parse_args()
71
72 blast_parser = BlastProteinResultParser(args.blast)
73 blast_parser.parse_blast()
74 blast_parser.print_results(args.hits, args.sort)
75
76
77 if __name__ == "__main__":
78 main()