comparison scripts/S01_run_first_blast.py @ 0:90b57ab0bd1d draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:23:16 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:90b57ab0bd1d
1 #!/usr/bin/env python
2 # coding: utf-8
3 # Author : Victor Mataigne
4
5 import itertools, argparse, os
6
7 """
8 IMPROVMENTS :
9 - Maybe a bit of code factoring
10 - See if it possible to avoid build several times the same db
11 """
12
13 # The script (and S03_run_second_blast.py as well) must be launched with the python '-W ignore' option if tested with planemo
14
15 def main():
16 parser = argparse.ArgumentParser()
17 parser.add_argument('files', help='fasta files separated by commas')
18 parser.add_argument('evalue', help='evalue for blast')
19 parser.add_argument('method', choices=['tblastx', 'diamond'], help='alignment tool (tblastx or diamond)')
20 args = parser.parse_args()
21
22 in_files = args.files.split(',')
23
24 if args.method == 'diamond':
25 in_files_translated = []
26 from Bio.Seq import Seq
27 from Bio.Alphabet import IUPAC
28
29 # From every sequence, make three sequences (translations in the three reading frames)
30 print 'Translating every sequence in all reading frames ...'
31 for file in in_files:
32 name = 'translated_%s' %file
33 in_files_translated.append(name)
34 translated_file = open(name, 'w')
35 with open(file, 'r') as file:
36 for name, seq in itertools.izip_longest(*[file]*2):
37 s = Seq(seq.strip('\n').upper(), IUPAC.ambiguous_dna)
38 translated_file.write(name.strip('\n')+'_orf_1\n')
39 translated_file.write(s.translate()._data+'\n')
40 translated_file.write(name.strip('\n')+'_orf_2\n')
41 translated_file.write(s[1:].translate()._data+'\n')
42 translated_file.write(name.strip('\n')+'_orf_3\n')
43 translated_file.write(s[2:].translate()._data+'\n')
44 translated_file.close()
45
46 # Make the list of all pairwise combinations
47 list_pairwise = itertools.combinations(in_files_translated, 2)
48
49 elif args.method == 'tblastx':
50 list_pairwise = itertools.combinations(in_files, 2)
51
52 else :
53 print 'Mispecified alignment tool'
54 exit()
55
56 os.mkdir('outputs_RBH_dna')
57
58 # Main loop
59
60 if args.method == 'diamond':
61 for pairwise in list_pairwise:
62 print "Pair of species:"
63 print pairwise
64
65 sp1, sp2 = pairwise[0].split('_')[1], pairwise[1].split('_')[1] #rename 'translated_Xx_transcriptom.fasta'
66 sub_directory_name = sp1 + '_' + sp2
67 os.mkdir('./blast_%s' %sub_directory_name)
68
69 print 'Running first blast with Diamond ...'
70
71 # Run diamond
72 os.system('diamond makedb --in %s -d %s >> log_diamond.log' %(pairwise[1], sp2))
73 os.system('diamond blastp -q %s -d %s --max-target-seqs 1 -o matches_blast1_%s -e %s --more-sensitive >> log_diamond.log' %(pairwise[0], sp2, sub_directory_name, args.evalue))
74
75 # tabular output :
76 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
77
78 a = pairwise[1].replace('translated_', '')
79 b = pairwise[0].replace('translated_', '')
80
81 # There is a chance to have no hits returned
82 if os.path.getsize('matches_blast1_%s' %sub_directory_name) == 0:
83 print 'No hits found. Processing next species pair ...'
84 else :
85
86 # Record only one best_hit per transcript (best of the 6 orfs)
87 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast1_%s %s %s %s %s %s' %(sub_directory_name, a, b, sub_directory_name, '1', args.method))
88
89 # 2d blast with only best hits as db
90 print 'Running second blast with Diamond ... '
91
92 os.system('python -W ignore S03_run_second_blast.py best_hits_db_blast1_%s %s %s %s %s' %(sub_directory_name, pairwise[0], sub_directory_name, args.evalue, args.method))
93
94 # Record only one best_hit per transcript (best of the 6 orfs)
95 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast2_%s %s %s %s %s %s' %(sub_directory_name, b, a, sub_directory_name, '2', args.method))
96
97 # Find Reciprocical Best Hits
98 name1 = 'best_hits_q_blast1_{}'.format(sub_directory_name)
99 name2 = 'best_hits_q_blast2_{}'.format(sub_directory_name)
100 os.system('python S05_find_rbh.py %s %s ' %(name1, name2))
101
102 os.system('mv log_diamond.log ./blast_%s' %sub_directory_name)
103 os.system('rm -f *.dmnd')
104
105 # Those files exist obly if hits were found during the first blast
106 if os.path.getsize('matches_blast1_%s' %sub_directory_name) != 0:
107 os.system('mv *best_hits* ./blast_%s' %sub_directory_name)
108 os.system('mv RBH* outputs_RBH_dna')
109
110 os.system('mv matches_blast* ./blast_%s' %(sub_directory_name))
111
112 os.mkdir('translated_seqs')
113 os.system('mv translated*.fasta ./translated_seqs')
114
115 elif args.method == 'tblastx':
116 for pairwise in list_pairwise:
117 print "Pair of species:"
118 print pairwise
119
120 sp1, sp2 = pairwise[0].split('_')[0], pairwise[1].split('_')[0]
121 sub_directory_name = sp1 + '_' + sp2
122 os.mkdir('./blast_%s' %sub_directory_name)
123
124 print 'Running first tblastx ...'
125
126 # Run diamond
127 os.system('formatdb -i %s -p F -o T >> log_tblastx.log' %(pairwise[1]))
128 os.system('blastall -p tblastx -d %s -i %s -o matches_blast1_%s -T F -e %s -F "mS" -b1 -v1 -K 1 -m 8 >> log_tblastx.log' %(pairwise[1], pairwise[0], sub_directory_name, args.evalue))
129
130 # tabular output :
131 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
132
133 # There is a chance to have no hits returned
134 if os.path.getsize('matches_blast1_%s' %sub_directory_name) == 0:
135 print 'No hits found. Processing next species pair ...'
136 else:
137
138 # Record only one best_hit per transcript (best of the 6 orfs)
139 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast1_%s %s %s %s %s %s' %(sub_directory_name, pairwise[1], pairwise[0], sub_directory_name, '1', args.method))
140
141 # 2d blast with only best hits as db
142 print 'Running second blast with Diamond ... '
143
144 os.system('python S03_run_second_blast.py best_hits_db_blast1_%s %s %s %s %s' %(sub_directory_name, pairwise[0], sub_directory_name, args.evalue, args.method))
145
146 # Record only one best_hit per transcript (best of the 6 orfs)
147 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast2_%s %s %s %s %s %s' %(sub_directory_name, pairwise[0], pairwise[1], sub_directory_name, '2', args.method))
148
149 # Find Reciprocical Best Hits
150 name1 = 'best_hits_q_blast1_{}'.format(sub_directory_name)
151 name2 = 'best_hits_q_blast2_{}'.format(sub_directory_name)
152 os.system('python S05_find_rbh.py %s %s ' %(name1, name2))
153
154 os.system('mv log_tblastx.log ./blast_%s' %sub_directory_name)
155 os.system('rm -f *.nhr')
156 os.system('rm -f *.nin')
157 os.system('rm -f *.nsd')
158 os.system('rm -f *.nsi')
159 os.system('rm -f *.nsq')
160
161 # Those files exist obly if hits were found during the first blast
162 if os.path.getsize('matches_blast1_%s' %sub_directory_name) != 0:
163 os.system('mv *best_hits* ./blast_%s' %sub_directory_name)
164 os.system('mv RBH* outputs_RBH_dna')
165
166 os.system('mv matches_blast* ./blast_%s' %(sub_directory_name))
167 #os.system('mv matches_blast2_%s ./blast_%s' %(sub_directory_name, sub_directory_name))
168
169 if __name__ == "__main__":
170 main()