Mercurial > repos > cpt > cpt_protein_blast_grouping
annotate protein_blast_grouping.py @ 0:7abe5f471364 draft
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
author | cpt |
---|---|
date | Wed, 24 Jul 2024 01:37:37 +0000 |
parents | |
children | f2a7dffab581 |
rev | line source |
---|---|
0
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
1 import argparse |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
2 import re |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
3 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
4 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
5 class BlastProteinResultParser: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
6 def __init__(self, blast_file): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
7 self.blast_file = blast_file |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
8 self.results = {} |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
9 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
10 def parse_blast(self): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
11 for line in self.blast_file: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
12 parts = line.strip().split("\t") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
13 query_id = parts[0] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
14 subject_titles = parts[2].split("<>") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
15 for title in subject_titles: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
16 organism = self.extract_organism(title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
17 if organism: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
18 if organism not in self.results: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
19 self.results[organism] = { |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
20 "unique_queries": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
21 "unique_hits": set(), |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
22 } |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
23 # "unique query" == query proteins had a LEAST one match in organism |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
24 self.results[organism]["unique_queries"].add(query_id) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
25 # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
26 self.results[organism]["unique_hits"].add(parts[1]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
27 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
28 @staticmethod |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
29 def extract_organism(title): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
30 match = re.search(r"\[(.*?)\]", title) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
31 return match.group(1) if match else None |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
32 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
33 def get_top_hits(self, num_hits, key="unique_queries"): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
34 def sort_key(item): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
35 return len(item[1][key]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
36 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
37 sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
38 return sorted_results[:num_hits] |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
39 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
40 def print_results(self, num_hits, sort_key="unique_queries"): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
41 top_hits = self.get_top_hits(num_hits, sort_key) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
42 print(f"# Top {num_hits} Hits") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
43 print( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
44 "{:<50} {:<25} {:<25}".format( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
45 "# Name", "Unique Query Matches", "Unique Subject Hits" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
46 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
47 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
48 for organism, data in top_hits: |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
49 print( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
50 "{:<50} {:<25} {:<25}".format( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
51 organism, len(data["unique_queries"]), len(data["unique_hits"]) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
52 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
53 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
54 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
55 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
56 def main(): |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
57 parser = argparse.ArgumentParser( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
58 description="Parse BLAST results and group by 'top hits' to an organism" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
59 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
60 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
61 parser.add_argument( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
62 "--hits", type=int, default=5, help="Number of top hits to display" |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
63 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
64 parser.add_argument( |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
65 "--sort", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
66 choices=["unique_queries", "unique_hits"], |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
67 default="unique_queries", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
68 help="Sort results by 'unique_queries' (default) or 'unique_hits'", |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
69 ) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
70 args = parser.parse_args() |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
71 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
72 blast_parser = BlastProteinResultParser(args.blast) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
73 blast_parser.parse_blast() |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
74 blast_parser.print_results(args.hits, args.sort) |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
75 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
76 |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
77 if __name__ == "__main__": |
7abe5f471364
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
cpt
parents:
diff
changeset
|
78 main() |