comparison scripts/S02_04_keep_one_hit_from_blast.py @ 0:90b57ab0bd1d draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:23:16 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:90b57ab0bd1d
1 #!/usr/bin/env python
2 # coding: utf-8
3 # Author : Victor Mataigne
4
5 import argparse, itertools, pickle
6
7 def list_with_max_score(list_of_hits):
8 """ Among a list of blast hits of the same query, returns the one which has the highest score """
9 max_score = 0
10 ind_max_score = 0
11 i = 0
12
13 for hit in list_of_hits:
14 if float(hit[11]) > max_score:
15 max_score, ind_max_score = hit[11], i
16 i += 1
17
18 return list_of_hits[ind_max_score]
19
20 def main():
21 parser = argparse.ArgumentParser()
22 parser.add_argument('matches', help='diamond blastp output file (tabular) between two species')
23 parser.add_argument('nucleic_file_db', help='Sequence file used as DB for the first blast')
24 parser.add_argument('nucleic_file_q', help='Sequence used as query for the first blast')
25 parser.add_argument('file_subname', help='keyword for output file name')
26 parser.add_argument('step', help='1 or 2 according to which blast has been performed')
27 parser.add_argument('method', choices=['tblastx', 'diamond'], help='alignment tool (tblastx or diamond)')
28 args = parser.parse_args()
29
30 print 'Keeping best hits in {}'.format(args.matches)
31
32 # read tab file in a list (a line = a list elem)
33 list_hits = []
34 dic_hits_common = {} # unique matches for query
35 dic_hits_common_db = {} # unique matches for db
36 with open(args.matches, 'r') as hits:
37 for hit in hits.readlines():
38 h = hit.strip('\n')
39 h2 = h.split('\t')
40 list_hits.append(h2)
41
42 if args.method == 'diamond':
43 if h2[0][0:-6] not in dic_hits_common.keys():
44 dic_hits_common[h2[0][0:-6]] = []
45 if h2[1][0:-6] not in dic_hits_common_db.keys():
46 dic_hits_common_db[h2[1][0:-6]] = []
47
48 elif args.method == 'tblastx':
49 if h2[0] not in dic_hits_common.keys():
50 dic_hits_common[h2[0]] = []
51 if h2[1] not in dic_hits_common_db.keys():
52 dic_hits_common_db[h2[1]] = []
53
54 # Gather in a list of lists elems with common query
55 for hit in list_hits:
56 if args.method == 'diamond':
57 dic_hits_common[hit[0][0:-6]].append(hit)
58 dic_hits_common_db[hit[1][0:-6]].append(hit)
59
60 elif args.method == 'tblastx':
61 dic_hits_common[hit[0]].append(hit)
62 dic_hits_common_db[hit[1]].append(hit)
63
64 # Keep only the best hits in queries
65 list_best_hits_q = []
66 for list_hits in dic_hits_common.values():
67 list_best_hits_q.append(list_with_max_score(list_hits))
68
69 # Keep only the best hits in db
70 list_best_hits_db = []
71 for list_hits in dic_hits_common_db.values():
72 list_best_hits_db.append(list_with_max_score(list_hits))
73
74 del list_hits
75
76 # This dict (exported then with pickle) stores the best hit in the db for the query
77 # A similar dict is built after the second blast, in which query and db are switched
78 # The comparison of the dicts allow to spot RBH
79 dico_best_hits_q = {}
80 for hit in list_best_hits_q:
81 if args.method == 'diamond':
82 dico_best_hits_q[hit[0][0:-6]] = hit[1][0:-6]
83 elif args.method == 'tblastx':
84 dico_best_hits_q[hit[0]] = hit[1]
85
86 n = 'dict_best_hits_from_blast_{}'.format(args.step)
87 pickle_dic_besthits_q = open(n, 'w')
88 pickle.dump(dico_best_hits_q, pickle_dic_besthits_q)
89 pickle_dic_besthits_q.close()
90
91 ## Other temp files :
92
93 # Make big dictionary with initial query fasta file
94 initial_seqs_q = {}
95 with open(args.nucleic_file_q, 'r') as nf :
96 for h, s in itertools.izip_longest(*[nf]*2):
97 header = h.strip('\n')
98 sequence = s.strip('\n')
99 initial_seqs_q[header] = sequence
100
101 # Make big dictionary with initial DB fasta file
102 initial_seqs_db = {}
103 with open(args.nucleic_file_db, 'r') as nf :
104 for h, s in itertools.izip_longest(*[nf]*2):
105 header = h.strip('\n')
106 sequence = s.strip('\n')
107 initial_seqs_db[header] = sequence
108
109 # Write best_hits from query with nucleic sequence in output file
110 name = 'best_hits_q_blast{}_{}'.format(args.step, args.file_subname)
111 output = open(name, 'w')
112 for hit in list_best_hits_q:
113 if args.method == 'diamond':
114 output.write('>'+hit[0][0:-6]+'\n')
115 output.write(initial_seqs_q['>'+hit[0][0:-6]]+'\n')
116 elif args.method == 'tblastx':
117 output.write('>'+hit[0]+'\n')
118 output.write(initial_seqs_q['>'+hit[0]]+'\n')
119 output.close()
120
121 # Write best_hits on db with nucleic sequence in output file
122 name = 'best_hits_db_blast{}_{}'.format(args.step, args.file_subname)
123 output = open(name, 'w')
124 for hit in list_best_hits_db:
125 if args.method == 'diamond':
126 output.write('>'+hit[1][0:-6]+'\n')
127 output.write(initial_seqs_db['>'+hit[1][0:-6]]+'\n')
128 elif args.method == 'tblastx':
129 output.write('>'+hit[1]+'\n')
130 output.write(initial_seqs_db['>'+hit[1]]+'\n')
131
132 output.close()
133
134 del initial_seqs_q
135 del initial_seqs_db
136
137 print 'Done'
138
139 if __name__ == "__main__":
140 main()