Mercurial > repos > cpt > cpt_protein_blast_grouping
diff protein_blast_grouping.py @ 0:7abe5f471364 draft
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
author | cpt |
---|---|
date | Wed, 24 Jul 2024 01:37:37 +0000 |
parents | |
children | f2a7dffab581 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/protein_blast_grouping.py Wed Jul 24 01:37:37 2024 +0000 @@ -0,0 +1,78 @@ +import argparse +import re + + +class BlastProteinResultParser: + def __init__(self, blast_file): + self.blast_file = blast_file + self.results = {} + + def parse_blast(self): + for line in self.blast_file: + parts = line.strip().split("\t") + query_id = parts[0] + subject_titles = parts[2].split("<>") + for title in subject_titles: + organism = self.extract_organism(title) + if organism: + if organism not in self.results: + self.results[organism] = { + "unique_queries": set(), + "unique_hits": set(), + } + # "unique query" == query proteins had a LEAST one match in organism + self.results[organism]["unique_queries"].add(query_id) + # "unique hits" == unique proteins from eahc organism were matched by ANY of the queries + self.results[organism]["unique_hits"].add(parts[1]) + + @staticmethod + def extract_organism(title): + match = re.search(r"\[(.*?)\]", title) + return match.group(1) if match else None + + def get_top_hits(self, num_hits, key="unique_queries"): + def sort_key(item): + return len(item[1][key]) + + sorted_results = sorted(self.results.items(), key=sort_key, reverse=True) + return sorted_results[:num_hits] + + def print_results(self, num_hits, sort_key="unique_queries"): + top_hits = self.get_top_hits(num_hits, sort_key) + print(f"# Top {num_hits} Hits") + print( + "{:<50} {:<25} {:<25}".format( + "# Name", "Unique Query Matches", "Unique Subject Hits" + ) + ) + for organism, data in top_hits: + print( + "{:<50} {:<25} {:<25}".format( + organism, len(data["unique_queries"]), len(data["unique_hits"]) + ) + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Parse BLAST results and group by 'top hits' to an organism" + ) + parser.add_argument("blast", type=argparse.FileType("r"), help="Blast Results") + parser.add_argument( + "--hits", type=int, default=5, help="Number of top hits to display" + ) + parser.add_argument( + "--sort", + choices=["unique_queries", "unique_hits"], + default="unique_queries", + help="Sort results by 'unique_queries' (default) or 'unique_hits'", + ) + args = parser.parse_args() + + blast_parser = BlastProteinResultParser(args.blast) + blast_parser.parse_blast() + blast_parser.print_results(args.hits, args.sort) + + +if __name__ == "__main__": + main()