Mercurial > repos > cpt > cpt_protein_blast_grouping
comparison protein_blast_grouping.py @ 0:7abe5f471364 draft
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
author | cpt |
---|---|
date | Wed, 24 Jul 2024 01:37:37 +0000 |
parents | |
children | f2a7dffab581 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7abe5f471364 |
---|---|
1 import argparse | |
2 import re | |
3 | |
4 | |
5 class BlastProteinResultParser: | |
6 def __init__(self, blast_file): | |
7 self.blast_file = blast_file | |
8 self.results = {} | |
9 | |
10 def parse_blast(self): | |
11 for line in self.blast_file: | |
12 parts = line.strip().split("\t") | |
13 query_id = parts[0] | |
14 subject_titles = parts[2].split("<>") | |
15 for title in subject_titles: | |
16 organism = self.extract_organism(title) | |
17 if organism: | |
18 if organism not in self.results: | |
19 self.results[organism] = { | |
20 "unique_queries": set(), | |
21 "unique_hits": set(), | |
22 } | |
23 # "unique query" == query proteins had a LEAST one match in organism | |
24 self.results[organism]["unique_queries"].add(query_id) | |
25 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries | |
26 self.results[organism]["unique_hits"].add(parts[1]) | |
27 | |
28 @staticmethod | |
29 def extract_organism(title): | |
30 match = re.search(r"\[(.*?)\]", title) | |
31 return match.group(1) if match else None | |
32 | |
33 def get_top_hits(self, num_hits, key="unique_queries"): | |
34 def sort_key(item): | |
35 return len(item[1][key]) | |
36 | |
37 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) | |
38 return sorted_results[:num_hits] | |
39 | |
40 def print_results(self, num_hits, sort_key="unique_queries"): | |
41 top_hits = self.get_top_hits(num_hits, sort_key) | |
42 print(f"# Top {num_hits} Hits") | |
43 print( | |
44 "{:<50} {:<25} {:<25}".format( | |
45 "# Name", "Unique Query Matches", "Unique Subject Hits" | |
46 ) | |
47 ) | |
48 for organism, data in top_hits: | |
49 print( | |
50 "{:<50} {:<25} {:<25}".format( | |
51 organism, len(data["unique_queries"]), len(data["unique_hits"]) | |
52 ) | |
53 ) | |
54 | |
55 | |
56 def main(): | |
57 parser = argparse.ArgumentParser( | |
58 description="Parse BLAST results and group by 'top hits' to an organism" | |
59 ) | |
60 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") | |
61 parser.add_argument( | |
62 "--hits", type=int, default=5, help="Number of top hits to display" | |
63 ) | |
64 parser.add_argument( | |
65 "--sort", | |
66 choices=["unique_queries", "unique_hits"], | |
67 default="unique_queries", | |
68 help="Sort results by 'unique_queries' (default) or 'unique_hits'", | |
69 ) | |
70 args = parser.parse_args() | |
71 | |
72 blast_parser = BlastProteinResultParser(args.blast) | |
73 blast_parser.parse_blast() | |
74 blast_parser.print_results(args.hits, args.sort) | |
75 | |
76 | |
77 if __name__ == "__main__": | |
78 main() |