Mercurial > repos > peterjc > blast_rbh
comparison tools/blast_rbh/blast_rbh.py @ 4:d8d9a9069586 draft
v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
author | peterjc |
---|---|
date | Wed, 19 Apr 2017 07:44:47 -0400 |
parents | 14b2e159b310 |
children | 8f4500f6f2aa |
comparison
equal
deleted
inserted
replaced
3:9ba8ebb636f4 | 4:d8d9a9069586 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. | 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. |
3 | 3 |
4 Run "blast_rbh.py -h" to see the help text, or read the associated | 4 Run "blast_rbh.py -h" to see the help text, or read the associated |
5 README.rst file which is also available on GitHub at: | 5 blast_rbh.xml and README.rst files which are available on GitHub at: |
6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh | 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh |
7 | 7 |
8 This requires Python and the NCBI BLAST+ tools to be installed | 8 This requires Python and the NCBI BLAST+ tools to be installed |
9 and on the $PATH. | 9 and on the $PATH. |
10 | 10 |
16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? | 16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? |
17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter | 17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter |
18 # results, rather than doing minimum HSP coverage in Python. | 18 # results, rather than doing minimum HSP coverage in Python. |
19 # [Not doing this right now as would break on older BLAST+] | 19 # [Not doing this right now as would break on older BLAST+] |
20 | 20 |
21 from __future__ import print_function | |
22 | |
21 import os | 23 import os |
24 import shutil | |
22 import sys | 25 import sys |
23 import tempfile | 26 import tempfile |
24 import shutil | 27 |
25 from optparse import OptionParser | 28 from optparse import OptionParser |
26 | 29 |
27 def stop_err( msg ): | |
28 sys.stderr.write("%s\n" % msg) | |
29 sys.exit(1) | |
30 | 30 |
31 def run(cmd): | 31 def run(cmd): |
32 return_code = os.system(cmd) | 32 return_code = os.system(cmd) |
33 if return_code: | 33 if return_code: |
34 stop_err("Error %i from: %s" % (return_code, cmd)) | 34 sys.exit("Error %i from: %s" % (return_code, cmd)) |
35 | |
35 | 36 |
36 if "--version" in sys.argv[1:]: | 37 if "--version" in sys.argv[1:]: |
37 #TODO - Capture version of BLAST+ binaries too? | 38 # TODO - Capture version of BLAST+ binaries too? |
38 print "BLAST RBH v0.1.6" | 39 print("BLAST RBH v0.1.11") |
39 sys.exit(0) | 40 sys.exit(0) |
40 | 41 |
41 #Parse Command Line | 42 try: |
43 threads = int(os.environ.get("GALAXY_SLOTS", "1")) | |
44 except ValueError: | |
45 threads = 1 | |
46 assert 1 <= threads, threads | |
47 | |
48 # Parse Command Line | |
42 usage = """Use as follows: | 49 usage = """Use as follows: |
43 | 50 |
44 $ python blast_rbh.py [options] A.fasta B.fasta | 51 $ python blast_rbh.py [options] A.fasta B.fasta |
52 | |
53 Many of the options are required. Example with proteins and blastp: | |
54 | |
55 $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta | |
56 | |
57 There is additional guideance in the help text in the blast_rbh.xml file, | |
58 which is shown to the user via the Galaxy interface to this tool. | |
45 """ | 59 """ |
46 | 60 |
47 parser = OptionParser(usage=usage) | 61 parser = OptionParser(usage=usage) |
48 parser.add_option("-a", "--alphabet", dest="dbtype", | 62 parser.add_option("-a", "--alphabet", dest="dbtype", |
49 default=None, | 63 default=None, |
50 help="Alphabet type (nucl or prot)") | 64 help="Alphabet type (nucl or prot; required)") |
51 parser.add_option("-t", "--task", dest="task", | 65 parser.add_option("-t", "--task", dest="task", |
52 default=None, | 66 default=None, |
53 help="BLAST task (e.g. blastp, blastn, megablast)") | 67 help="BLAST task (e.g. blastp, blastn, megablast; required)") |
54 parser.add_option("-i","--identity", dest="min_identity", | 68 parser.add_option("-i", "--identity", dest="min_identity", |
55 default="70", | 69 default="70", |
56 help="Minimum percentage identity (optional, default 70)") | 70 help="Minimum percentage identity (optional, default 70)") |
57 parser.add_option("-c", "--coverage", dest="min_coverage", | 71 parser.add_option("-c", "--coverage", dest="min_coverage", |
58 default="50", | 72 default="50", |
59 help="Minimum HSP coverage (optional, default 50)") | 73 help="Minimum HSP coverage (optional, default 50)") |
60 parser.add_option("--nr", dest="nr", default=False, action="store_true", | 74 parser.add_option("--nr", dest="nr", default=False, action="store_true", |
61 help="Preprocess FASTA files to collapse identifical " | 75 help="Preprocess FASTA files to collapse identitical " |
62 "entries (make sequences non-redundant)") | 76 "entries (make sequences non-redundant)") |
63 parser.add_option("-o", "--output", dest="output", | 77 parser.add_option("-o", "--output", dest="output", |
64 default=None, metavar="FILE", | 78 default=None, metavar="FILE", |
65 help="Output filename") | 79 help="Output filename (required)") |
80 parser.add_option("--threads", dest="threads", | |
81 default=threads, | |
82 help="Number of threads when running BLAST. Defaults to the " | |
83 "$GALAXY_SLOTS environment variable if set, or 1.") | |
66 options, args = parser.parse_args() | 84 options, args = parser.parse_args() |
67 | 85 |
68 if len(args) != 2: | 86 if len(args) != 2: |
69 stop_err("Expects two input FASTA filenames") | 87 sys.exit("Expects two input FASTA filenames") |
70 fasta_a, fasta_b = args | 88 fasta_a, fasta_b = args |
71 if not os.path.isfile(fasta_a): | 89 if not os.path.isfile(fasta_a): |
72 stop_err("Missing input file for species A: %r" % fasta_a) | 90 sys.exit("Missing input file for species A: %r" % fasta_a) |
73 if not os.path.isfile(fasta_b): | 91 if not os.path.isfile(fasta_b): |
74 stop_err("Missing input file for species B: %r" % fasta_b) | 92 sys.exit("Missing input file for species B: %r" % fasta_b) |
75 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): | 93 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): |
76 self_comparison = True | 94 self_comparison = True |
77 print("Doing self comparison; ignoring self matches.") | 95 print("Doing self comparison; ignoring self matches.") |
78 else: | 96 else: |
79 self_comparison = False | 97 self_comparison = False |
80 | 98 |
81 if not options.output: | 99 if not options.output: |
82 stop_err("Output filename required, e.g. -o example.tab") | 100 sys.exit("Output filename required, e.g. -o example.tab") |
83 out_file = options.output | 101 out_file = options.output |
84 | 102 |
85 try: | 103 try: |
86 min_identity = float(options.min_identity) | 104 min_identity = float(options.min_identity) |
87 except ValueError: | 105 except ValueError: |
88 stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) | 106 sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) |
89 if not (0 <= min_identity <= 100): | 107 if not (0 <= min_identity <= 100): |
90 stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) | 108 sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) |
91 try: | 109 try: |
92 min_coverage = float(options.min_coverage) | 110 min_coverage = float(options.min_coverage) |
93 except ValueError: | 111 except ValueError: |
94 stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) | 112 sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) |
95 if not (0 <= min_coverage <= 100): | 113 if not (0 <= min_coverage <= 100): |
96 stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) | 114 sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) |
97 | 115 |
98 if not options.task: | 116 if not options.task: |
99 stop_err("Missing BLAST task, e.g. -t blastp") | 117 sys.exit("Missing BLAST task, e.g. -t blastp") |
100 blast_type = options.task | 118 blast_type = options.task |
101 | 119 |
102 if not options.dbtype: | 120 if not options.dbtype: |
103 stop_err("Missing database type, -a nucl, or -a prot") | 121 sys.exit("Missing database type, -a nucl, or -a prot") |
104 dbtype = options.dbtype | 122 dbtype = options.dbtype |
105 if dbtype == "nucl": | 123 if dbtype == "nucl": |
106 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: | 124 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: |
107 blast_cmd = "blastn -task %s" % blast_type | 125 blast_cmd = "blastn -task %s" % blast_type |
108 elif blast_type == "tblastx": | 126 elif blast_type == "tblastx": |
109 blast_cmd = "tblastx" | 127 blast_cmd = "tblastx" |
110 else: | 128 else: |
111 stop_err("Invalid BLAST type for BLASTN: %r" % blast_type) | 129 sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type) |
112 elif dbtype == "prot": | 130 elif dbtype == "prot": |
113 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]: | 131 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]: |
114 stop_err("Invalid BLAST type for BLASTP: %r" % blast_type) | 132 sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type) |
115 blast_cmd = "blastp -task %s" % blast_type | 133 blast_cmd = "blastp -task %s" % blast_type |
116 else: | 134 else: |
117 stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) | 135 sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) |
118 | 136 |
119 try: | 137 try: |
120 threads = int(os.environ.get("GALAXY_SLOTS", "1")) | 138 threads = int(options.threads) |
121 except: | 139 except ValueError: |
122 threads = 1 | 140 sys.exit("Expected positive integer for number of threads, not %r" % options.threads) |
123 assert 1 <= threads, threads | 141 if threads < 1: |
142 sys.exit("Expected positive integer for number of threads, not %r" % threads) | |
124 | 143 |
125 makeblastdb_exe = "makeblastdb" | 144 makeblastdb_exe = "makeblastdb" |
126 | 145 |
127 base_path = tempfile.mkdtemp() | 146 base_path = tempfile.mkdtemp() |
128 tmp_a = os.path.join(base_path, "SpeciesA.fasta") | 147 tmp_a = os.path.join(base_path, "SpeciesA.fasta") |
136 tmp_b = os.path.join(base_path, "SpeciesB.fasta") | 155 tmp_b = os.path.join(base_path, "SpeciesB.fasta") |
137 db_b = os.path.join(base_path, "SpeciesB") | 156 db_b = os.path.join(base_path, "SpeciesB") |
138 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | 157 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") |
139 log = os.path.join(base_path, "blast.log") | 158 log = os.path.join(base_path, "blast.log") |
140 | 159 |
141 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? | 160 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs? |
142 c_query = 0 | 161 c_query = 0 |
143 c_match = 1 | 162 c_match = 1 |
144 c_score = 2 | 163 c_score = 2 |
145 c_identity = 3 | 164 c_identity = 3 |
146 c_coverage = 4 | 165 c_coverage = 4 |
147 c_qlen = 5 | 166 c_qlen = 5 |
148 c_length = 6 | 167 c_length = 6 |
149 | 168 |
150 tie_warning = 0 | 169 tie_warning = 0 |
151 | 170 |
171 | |
152 def best_hits(blast_tabular, ignore_self=False): | 172 def best_hits(blast_tabular, ignore_self=False): |
153 """Iterate over BLAST tabular output, returns best hits as 2-tuples. | 173 """Iterate over BLAST tabular output, returns best hits as 2-tuples. |
154 | 174 |
155 Each return value is (query name, tuple of value for the best hit). | 175 Each return value is (query name, tuple of value for the best hit). |
156 | 176 |
161 """ | 181 """ |
162 global tie_warning | 182 global tie_warning |
163 current = None | 183 current = None |
164 best_score = None | 184 best_score = None |
165 best = None | 185 best = None |
186 col_count = len(cols.split()) | |
166 with open(blast_tabular) as h: | 187 with open(blast_tabular) as h: |
167 for line in h: | 188 for line in h: |
168 if line.startswith("#"): | 189 if line.startswith("#"): |
169 continue | 190 continue |
170 parts = line.rstrip("\n").split("\t") | 191 parts = line.rstrip("\n").split("\t") |
192 if len(parts) != col_count: | |
193 # Using NCBI BLAST+ 2.2.27 the undefined field is ignored | |
194 # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( | |
195 sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" | |
196 "Note the qcovhsp field was only added in version 2.2.28\n" | |
197 % (col_count, len(parts), line)) | |
171 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: | 198 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: |
172 continue | 199 continue |
173 a = parts[c_query] | 200 a = parts[c_query] |
174 b = parts[c_match] | 201 b = parts[c_match] |
175 if ignore_self and a == b: | 202 if ignore_self and a == b: |
176 continue | 203 continue |
177 score = float(parts[c_score]) | 204 score = float(parts[c_score]) |
178 qlen = int(parts[c_qlen]) | 205 qlen = int(parts[c_qlen]) |
179 length = int(parts[c_length]) | 206 length = int(parts[c_length]) |
180 #print("Considering hit for %s to %s with score %s..." % (a, b, score)) | 207 # print("Considering hit for %s to %s with score %s..." % (a, b, score)) |
181 if current is None: | 208 if current is None: |
182 #First hit | 209 # First hit |
183 assert best is None | 210 assert best is None |
184 assert best_score is None | 211 assert best_score is None |
185 best = dict() | 212 best = dict() |
186 #Now append this hit... | 213 # Now append this hit... |
187 elif a != current: | 214 elif a != current: |
188 #New hit | 215 # New hit |
189 if len(best) == 1: | 216 if len(best) == 1: |
190 #Unambiguous (no tied matches) | 217 # Unambiguous (no tied matches) |
191 yield current, list(best.values())[0] | 218 yield current, list(best.values())[0] |
192 else: | 219 else: |
193 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) | 220 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
194 tie_warning += 1 | 221 tie_warning += 1 |
195 best = dict() | 222 best = dict() |
196 #Now append this hit... | 223 # Now append this hit... |
197 elif score < best_score: | 224 elif score < best_score: |
198 #print("No improvement for %s, %s < %s" % (a, score, best_score)) | 225 # print("No improvement for %s, %s < %s" % (a, score, best_score)) |
199 continue | 226 continue |
200 elif score > best_score: | 227 elif score > best_score: |
201 #This is better, discard old best | 228 # This is better, discard old best |
202 best = dict() | 229 best = dict() |
203 #Now append this hit... | 230 # Now append this hit... |
204 else: | 231 else: |
205 #print("Tied best hits for %s" % a) | 232 # print("Tied best hits for %s" % a) |
206 assert best_score == score | 233 assert best_score == score |
207 #Now append this hit... | 234 # Now append this hit... |
208 current = a | 235 current = a |
209 best_score = score | 236 best_score = score |
210 #This will collapse two equally good hits to the same target (e.g. duplicated domain) | 237 # This will collapse two equally good hits to the same target (e.g. duplicated domain) |
211 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) | 238 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) |
212 #Best hit for final query, if unambiguous: | 239 # Best hit for final query, if unambiguous: |
213 if current is not None: | 240 if current is not None: |
214 if len(best)==1: | 241 if len(best) == 1: |
215 yield current, list(best.values())[0] | 242 yield current, list(best.values())[0] |
216 else: | 243 else: |
217 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) | 244 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
218 tie_warning += 1 | 245 tie_warning += 1 |
246 | |
219 | 247 |
220 def check_duplicate_ids(filename): | 248 def check_duplicate_ids(filename): |
221 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py | 249 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py |
222 # TODO - just use Biopython's FASTA parser? | 250 # TODO - just use Biopython's FASTA parser? |
223 if not os.path.isfile(filename): | 251 if not os.path.isfile(filename): |
224 stop_err("Missing FASTA file %r" % filename, 2) | 252 sys.stderr.write("Missing FASTA file %r\n" % filename) |
253 sys.exit(2) | |
225 identifiers = set() | 254 identifiers = set() |
226 handle = open(filename) | 255 handle = open(filename) |
227 for line in handle: | 256 for line in handle: |
228 if line.startswith(">"): | 257 if line.startswith(">"): |
229 # The split will also take care of the new line character, | 258 # The split will also take care of the new line character, |
230 # e.g. ">test\n" and ">test description here\n" both give "test" | 259 # e.g. ">test\n" and ">test description here\n" both give "test" |
231 seq_id = line[1:].split(None, 1)[0] | 260 seq_id = line[1:].split(None, 1)[0] |
232 if seq_id in identifiers: | 261 if seq_id in identifiers: |
233 handle.close() | 262 handle.close() |
234 stop_err("Repeated identifiers, e.g. %r" % seq_id, 3) | 263 sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id) |
264 sys.exit(3) | |
235 identifiers.add(seq_id) | 265 identifiers.add(seq_id) |
236 handle.close() | 266 handle.close() |
237 | 267 |
268 | |
238 def make_nr(input_fasta, output_fasta, sep=";"): | 269 def make_nr(input_fasta, output_fasta, sep=";"): |
239 #TODO - seq-hash based to avoid loading everything into RAM? | 270 # TODO - seq-hash based to avoid loading everything into RAM? |
240 by_seq = dict() | 271 by_seq = dict() |
241 try: | 272 try: |
242 from Bio import SeqIO | 273 from Bio import SeqIO |
243 except KeyError: | 274 except KeyError: |
244 stop_err("Missing Biopython") | 275 sys.exit("Missing Biopython") |
245 for record in SeqIO.parse(input_fasta, "fasta"): | 276 for record in SeqIO.parse(input_fasta, "fasta"): |
246 s = str(record.seq).upper() | 277 s = str(record.seq).upper() |
247 try: | 278 try: |
248 by_seq[s].append(record.id) | 279 by_seq[s].append(record.id) |
249 except KeyError: | 280 except KeyError: |
257 duplicates.update(cluster[1:]) | 288 duplicates.update(cluster[1:]) |
258 else: | 289 else: |
259 unique += 1 | 290 unique += 1 |
260 del by_seq | 291 del by_seq |
261 if duplicates: | 292 if duplicates: |
262 #TODO - refactor as a generator with single SeqIO.write(...) call | 293 # TODO - refactor as a generator with single SeqIO.write(...) call |
263 with open(output_fasta, "w") as handle: | 294 with open(output_fasta, "w") as handle: |
264 for record in SeqIO.parse(input_fasta, "fasta"): | 295 for record in SeqIO.parse(input_fasta, "fasta"): |
265 if record.id in representatives: | 296 if record.id in representatives: |
266 cluster = representatives[record.id] | 297 cluster = representatives[record.id] |
267 record.id = sep.join(cluster) | 298 record.id = sep.join(cluster) |
268 record.description = "representing %i records" % len(cluster) | 299 record.description = "representing %i records" % len(cluster) |
269 elif record.id in duplicates: | 300 elif record.id in duplicates: |
270 continue | 301 continue |
271 SeqIO.write(record, handle, "fasta") | 302 SeqIO.write(record, handle, "fasta") |
272 print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) | 303 print("%i unique entries; removed %i duplicates leaving %i representative records" |
304 % (unique, len(duplicates), len(representatives))) | |
273 else: | 305 else: |
274 os.symlink(os.path.abspath(input_fasta), output_fasta) | 306 os.symlink(os.path.abspath(input_fasta), output_fasta) |
275 print("No perfect duplicates in file, %i unique entries" % unique) | 307 print("No perfect duplicates in file, %i unique entries" % unique) |
276 | 308 |
277 #print("Starting...") | 309 |
310 # print("Starting...") | |
278 check_duplicate_ids(fasta_a) | 311 check_duplicate_ids(fasta_a) |
279 if not self_comparison: | 312 if not self_comparison: |
280 check_duplicate_ids(fasta_b) | 313 check_duplicate_ids(fasta_b) |
281 | 314 |
282 if options.nr: | 315 if options.nr: |
284 if not self_comparison: | 317 if not self_comparison: |
285 make_nr(fasta_b, tmp_b) | 318 make_nr(fasta_b, tmp_b) |
286 fasta_a = tmp_a | 319 fasta_a = tmp_a |
287 fasta_b = tmp_b | 320 fasta_b = tmp_b |
288 | 321 |
289 #TODO - Report log in case of error? | 322 # TODO - Report log in case of error? |
290 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) | 323 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) |
291 if not self_comparison: | 324 if not self_comparison: |
292 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | 325 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) |
293 #print("BLAST databases prepared.") | 326 # print("BLAST databases prepared.") |
294 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 327 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
295 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) | 328 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) |
296 #print("BLAST species A vs species B done.") | 329 # print("BLAST species A vs species B done.") |
297 if not self_comparison: | 330 if not self_comparison: |
298 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 331 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
299 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) | 332 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) |
300 #print("BLAST species B vs species A done.") | 333 # print("BLAST species B vs species A done.") |
301 | 334 |
302 | 335 |
303 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) | 336 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) |
304 | 337 |
305 | 338 |
306 count = 0 | 339 count = 0 |
307 outfile = open(out_file, 'w') | 340 outfile = open(out_file, 'w') |
308 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") | 341 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") |
309 for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): | 342 for a, (b, a_score_float, a_score_str, |
343 a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): | |
310 if b not in best_b_vs_a: | 344 if b not in best_b_vs_a: |
311 #Match b has no best hit | 345 # Match b has no best hit |
312 continue | 346 continue |
313 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] | 347 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] |
314 if a != a2: | 348 if a != a2: |
315 #Not an RBH | 349 # Not an RBH |
316 continue | 350 continue |
317 #Start with IDs, lengths, coverage | 351 # Start with IDs, lengths, coverage |
318 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] | 352 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] |
319 #Alignment length was an integer so don't care about original string | 353 # Alignment length was an integer so don't care about original string |
320 values.append(min(a_length, b_length)) | 354 values.append(min(a_length, b_length)) |
321 #Output the original string versions of the scores | 355 # Output the original string versions of the scores |
322 if float(a_identity_str) < float(b_identity_str): | 356 if float(a_identity_str) < float(b_identity_str): |
323 values.append(a_identity_str) | 357 values.append(a_identity_str) |
324 else: | 358 else: |
325 values.append(b_identity_str) | 359 values.append(b_identity_str) |
326 if a_score_float < b_score_float: | 360 if a_score_float < b_score_float: |
328 else: | 362 else: |
329 values.append(b_score_str) | 363 values.append(b_score_str) |
330 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) | 364 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) |
331 count += 1 | 365 count += 1 |
332 outfile.close() | 366 outfile.close() |
333 print "Done, %i RBH found" % count | 367 print("Done, %i RBH found" % count) |
334 if tie_warning: | 368 if tie_warning: |
335 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n") | 369 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n") |
336 | 370 |
337 #Remove temp files... | 371 # Remove temp files... |
338 shutil.rmtree(base_path) | 372 shutil.rmtree(base_path) |